In [19]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from treelib import Node, Tree

In [20]:
class Node:
    def __init__(self, name, feature=None, threshold=None, left_child=None, right_child=None, is_leaf=False, value=-1):
        self.feature = feature
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child
        self.is_leaf = is_leaf
        self.value = value
        self.name = name
    
    def forward(self, x):
        if x[self.feature] < self.threshold:
            if self.is_leaf:
                return self.left_child
            return self.left_child.forward(x)
        else:
            if self.is_leaf:
                return self.right_child
            return self.right_child.forward(x)

In [54]:
class MyDecisionTree:
    def __init__(self, min_samples=1):
        self.root_node = Node('root')
        self.node_count = 0
        self.min_samples = min_samples
    
    def fit(self, X, Y):
        self.fit_util(X, Y, self.root_node)


    def fit_util(self, X, Y, current_node):
        if np.unique(Y).shape[0] == 1:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        scores = []
        for column in X.columns:
            X_train = X[column].to_numpy()
            X_train = X_train.reshape(X_train.shape[0], 1)
            clf = LogisticRegression(random_state=0)
            clf.fit(X_train, Y)
            score = clf.score(X_train, Y)
            scores.append(score)
        scores = np.array(scores)
        best_feature = X.columns[np.argmax(scores)]
        best_feature_values = np.unique(X[best_feature])
        best_feature_values.sort()

        partition = self.get_partition(X, Y, best_feature, best_feature_values)
        
        if partition == None:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        (X_left, Y_left), (X_right, Y_right), threshold = partition
        current_node.threshold = threshold
        current_node.feature = best_feature
        current_node.name = f'{best_feature} {threshold} {self.node_count}'
        self.node_count += 1
        current_node.left_child = Node('unnamed')
        current_node.right_child = Node('unnamed')

        self.fit_util(X_left, Y_left, current_node.left_child)
        self.fit_util(X_right, Y_right, current_node.right_child)

        

    def do_split(self, X, thresh):
        """
            Split the data at a node based on threshold
        """

        left_child_ids = np.where(X <= thresh, True, False)
        right_child_ids = np.where(X > thresh, True, False)
        return left_child_ids, right_child_ids
    
    def find_entropy(self, Y):
        probs = []
        possible_classes, counts = np.unique(Y, return_counts=True)
        sort_indices = np.argsort(possible_classes)
        possible_classes = possible_classes[sort_indices]
        counts = counts[sort_indices]
        
        for class_label, count in zip(possible_classes, counts):
            probs.append(count/Y.shape[0])
        
        entropy = 0
        for prob in probs:
            entropy -= prob*np.log2(prob)
        
        return entropy

    def get_partition(self, X, Y, feature, thresholds):
        '''
            This function should return left and right
            partitions according to appropritate
            partitioning algorithm. Return None if all
            data has same label
        '''

        # if only 1 class available at a node OR MIN_SAMPLES left at a node - Leaf Node reached
        if(len(Y) < self.min_samples):
            return None
        
        best_info_gain = -float('inf')
        best_thresh = thresholds[0]        
        for thresh in thresholds[:-1]:

            left_child_ids, right_child_ids = self.do_split(X[feature].to_numpy(), thresh)

            parent_pts = X.shape[0]
            left_child_pts = len(left_child_ids)
            right_child_pts = len(right_child_ids)

            info_gain = self.find_entropy(Y) - (left_child_pts / parent_pts) * self.find_entropy(Y[left_child_ids]) - (right_child_pts / parent_pts) * self.find_entropy(Y[right_child_ids])

            if(info_gain > best_info_gain):

                best_info_gain = info_gain
                best_thresh = thresh
        
        # partition according to best threshold
        best_left_ids, best_right_ids = self.do_split(X[feature].to_numpy(), best_thresh)
        if(len(Y[best_left_ids]) == 0 or len(Y[best_right_ids]) == 0):
            print(len(Y[best_left_ids]), len(Y[best_right_ids]))
            return None
        return (X[best_left_ids], Y[best_left_ids]), (X[best_right_ids], Y[best_right_ids]), best_thresh
    
    def print_tree(self):
        tree = Tree()
        self.print_tree_util(self.root_node, tree)
        tree.show()
        return tree

    def print_tree_util(self, root, tree, parent=None):
        if parent is not None:
            tree.create_node(root.name, root.name, parent=parent.name)
        else:
            tree.create_node(root.name, root.name)
        if root.is_leaf:
            return
        self.print_tree_util(root.left_child, tree, root)
        self.print_tree_util(root.right_child, tree, root)


In [None]:
df = pd.read_csv('data/preprocessed_cancer.csv')

In [None]:
print(Y.shape)

In [None]:
dt = MyDecisionTree()
X = df.loc[:, df.columns != 'Biopsy']
Y = df['Biopsy'].to_numpy()
dt.fit(X, Y)

In [57]:
iris = load_iris()
X_iris = iris.data
Y_iris = iris.target
X_iris = pd.DataFrame(X_iris)
dt_iris = MyDecisionTree(min_samples=10)
dt_iris.fit(X_iris, Y_iris)
tree = dt_iris.print_tree()

3 0.6 0
├── 3 1.7 2
│   ├── 0 5.6 34
│   │   ├── 0 5.7 36
│   │   │   ├── 0 7.7 38
│   │   │   │   ├── 0 5.8 39
│   │   │   │   │   ├── 0 7.6 41
│   │   │   │   │   │   ├── 0 7.4 42
│   │   │   │   │   │   │   ├── 0 7.3 43
│   │   │   │   │   │   │   │   ├── 0 7.2 44
│   │   │   │   │   │   │   │   │   ├── 0 7.1 45
│   │   │   │   │   │   │   │   │   │   ├── 0 6.9 46
│   │   │   │   │   │   │   │   │   │   │   ├── 0 6.8 47
│   │   │   │   │   │   │   │   │   │   │   │   ├── 0 6.7 48
│   │   │   │   │   │   │   │   │   │   │   │   │   ├── 0 6.5 49
│   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── 0 6.4 50
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── 0 6.3 51
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── 0 6.2 52
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── leaf 2 53
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   └── leaf 2 54
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   

In [58]:
tree.to_graphviz()

digraph tree {
	"3 0.6 0" [label="3 0.6 0", shape=circle]
	"3 1.7 2" [label="3 1.7 2", shape=circle]
	"leaf 0 1" [label="leaf 0 1", shape=circle]
	"0 5.6 34" [label="0 5.6 34", shape=circle]
	"2 5.1 3" [label="2 5.1 3", shape=circle]
	"0 5.7 36" [label="0 5.7 36", shape=circle]
	"leaf 2 35" [label="leaf 2 35", shape=circle]
	"0 6.9 4" [label="0 6.9 4", shape=circle]
	"leaf 2 33" [label="leaf 2 33", shape=circle]
	"0 7.7 38" [label="0 7.7 38", shape=circle]
	"leaf 2 37" [label="leaf 2 37", shape=circle]
	"0 6.8 5" [label="0 6.8 5", shape=circle]
	"leaf 1 32" [label="leaf 1 32", shape=circle]
	"0 5.8 39" [label="0 5.8 39", shape=circle]
	"leaf 2 66" [label="leaf 2 66", shape=circle]
	"0 6.7 6" [label="0 6.7 6", shape=circle]
	"leaf 1 31" [label="leaf 1 31", shape=circle]
	"0 7.6 41" [label="0 7.6 41", shape=circle]
	"leaf 2 40" [label="leaf 2 40", shape=circle]
	"0 6.6 7" [label="0 6.6 7", shape=circle]
	"leaf 1 30" [label="leaf 1 30", shape=circle]
	"0 7.4 42" [label="0 7.4 42", shape=c