In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from treelib import Tree


In [2]:
class Node:
    def __init__(self, name, feature=None, threshold=None, left_child=None, right_child=None, is_leaf=False, value=-1):
        self.feature = feature
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child
        self.is_leaf = is_leaf
        self.value = value
        self.name = name
    
    def forward(self, x):
        # print(self.feature)
        if x[self.feature] < self.threshold:
            if self.is_leaf:
                return self.value
            return self.left_child.forward(x)
        else:
            if self.is_leaf:
                return self.value
            return self.right_child.forward(x)

In [3]:
class MyDecisionTree:
    def __init__(self, min_samples=1):
        self.root_node = Node('root')
        self.node_count = 0
        self.min_samples = min_samples
    
    def predict(self, X, cols_d):
        
        Y_pred = []
        for i in range(len(X)):

            x_i = X[i]
            # print(x_i[0])
            curr_node = self.root_node
            predicted_y = np.array(self.predict_util(x_i, curr_node, cols_d))
            # print(predicted_y)
            Y_pred.append(predicted_y)

        return np.array(Y_pred)

    def predict_util(self, x, curr_node, cols_d):
        
        print(curr_node.feature)
        if(curr_node.is_leaf):
            return curr_node.value
        if x[cols_d[curr_node.feature]] <= curr_node.threshold:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.left_child, cols_d)
        else:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.right_child, cols_d)

    def fit(self, X, Y):
        self.fit_util(X, Y, self.root_node)


    def fit_util(self, X, Y, current_node):
        if np.unique(Y).shape[0] == 1:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        scores = []
        for column in X.columns:
            X_train = X[column].to_numpy()
            X_train = X_train.reshape(X_train.shape[0], 1)
            clf = LogisticRegression(random_state=0, max_iter=100)
            clf.fit(X_train, Y)
            score = clf.score(X_train, Y)
            scores.append(score)
        scores = np.array(scores)
        best_feature = X.columns[np.argmax(scores)]
        best_feature_values = np.unique(X[best_feature])
        best_feature_values.sort()

        partition = self.get_partition(X, Y, best_feature, best_feature_values)
        
        if partition == None:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        (X_left, Y_left), (X_right, Y_right), threshold = partition
        current_node.threshold = threshold
        current_node.feature = best_feature
        current_node.name = f'{best_feature} {threshold} {self.node_count}'
        self.node_count += 1
        current_node.left_child = Node('No name')
        current_node.right_child = Node('No name')

        self.fit_util(X_left, Y_left, current_node.left_child)
        self.fit_util(X_right, Y_right, current_node.right_child)

        

    def do_split(self, X, thresh):
        """
            Split the data at a node based on threshold
        """

        left_child_ids = np.where(X <= thresh, True, False)
        right_child_ids = np.where(X > thresh, True, False)
        return left_child_ids, right_child_ids
    
    def find_entropy(self, Y):
        probs = []
        possible_classes, counts = np.unique(Y, return_counts=True)
        sort_indices = np.argsort(possible_classes)
        possible_classes = possible_classes[sort_indices]
        counts = counts[sort_indices]
        
        for class_label, count in zip(possible_classes, counts):
            probs.append(count/Y.shape[0])
        
        entropy = 0
        for prob in probs:
            entropy -= prob*np.log2(prob)
        
        return entropy

    def get_partition(self, X, Y, feature, thresholds):
        '''
            This function should return left and right
            partitions according to appropritate
            partitioning algorithm. Return None if all
            data has same label
        '''
        
        # if only 1 class available at a node OR MIN_SAMPLES left at a node - Leaf Node reached
        if(len(Y) < self.min_samples):
            return None
        
        best_info_gain = -float('inf')
        best_thresh = thresholds[0]        
        for thresh in thresholds[:-1]:

            left_child_ids, right_child_ids = self.do_split(X[feature].to_numpy(), thresh)

            parent_pts = X.shape[0]
            left_child_pts = len(left_child_ids)
            right_child_pts = len(right_child_ids)

            info_gain = self.find_entropy(Y) - (left_child_pts / parent_pts) * self.find_entropy(Y[left_child_ids]) - (right_child_pts / parent_pts) * self.find_entropy(Y[right_child_ids])

            if(info_gain > best_info_gain):

                best_info_gain = info_gain
                best_thresh = thresh
        
        # partition according to best threshold
        best_left_ids, best_right_ids = self.do_split(X[feature].to_numpy(), best_thresh)
        print(len(Y[best_left_ids]), len(Y[best_right_ids]))
        if(len(Y[best_left_ids]) == 0 or len(Y[best_right_ids]) == 0):
            print(len(Y[best_left_ids]), len(Y[best_right_ids]))
            return None
        return (X[best_left_ids], Y[best_left_ids]), (X[best_right_ids], Y[best_right_ids]), best_thresh
    
    def print_tree(self):
        tree = Tree()
        self.print_tree_util(self.root_node, tree)
        tree.show()
        return tree

    def print_tree_util(self, root, tree, parent=None):
        if parent is not None:
            tree.create_node(root.name, root.name, parent=parent.name)
        else:
            tree.create_node(root.name, root.name)
        if root.is_leaf:
            return
        self.print_tree_util(root.left_child, tree, root)
        self.print_tree_util(root.right_child, tree, root)


In [4]:
df = pd.read_csv('data/preprocessed_cancer.csv')

In [5]:
df.columns

Index(['Unnamed: 0', 'Age', 'Number of sexual partners',
       'First sexual intercourse', 'Num of pregnancies', 'Smokes',
       'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives',
       'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs',
       'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis',
       'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
       'STDs:syphilis', 'STDs:pelvic inflammatory disease',
       'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS',
       'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')

In [6]:
from matplotlib import test


dt = MyDecisionTree(min_samples = 686)
X = df.drop(['Unnamed: 0', 'Biopsy'], axis=1)
Y = df['Biopsy'].to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape)
dt.fit(X_train, Y_train)

(686, 35)
629 57


In [64]:
unq, counts = np.unique(Y_test, return_counts=True)
print(counts)

[161  11]


In [7]:
tree = dt.print_tree()

Schiller 0 0
├── leaf 0 1
└── leaf 1 2



In [32]:
digits = load_digits()
X_digits, Y_digits = digits.data, digits.target
X_digits = X_digits/255
X_digits_train,X_digits_test,Y_digits_train,Y_digits_test=train_test_split(X_digits,Y_digits,test_size=0.2,random_state=42)
X_digits_train = pd.DataFrame(X_digits_train)
X_digits_test = pd.DataFrame(X_digits_test)

print(X_digits_train.shape)
print(X_digits_test.shape)

# X_iris = pd.DataFrame(X_iris)
# print(X_iris.shape)

(1437, 64)
(360, 64)


In [33]:
dt_digits = MyDecisionTree(min_samples=1)
dt_digits.fit(X_digits_train, Y_digits_train)
tree = dt_digits.print_tree()

372 1065
372 0
372 0
249 816
249 0
249 0
276 540
276 0
276 0
540 0
540 0
34 0.0 0
├── 53 0.0 2
│   ├── 21 0.0 4
│   │   ├── leaf 0 6
│   │   └── leaf 6 5
│   └── leaf 7 3
└── leaf 3 1



In [34]:
pred_y_digits = dt_digits.predict(X_digits_test.to_numpy()).squeeze()
# print(pred_y_digits)

accuracy = (pred_y_digits == Y_digits_test).sum() / Y_digits_test.shape[0]
print(accuracy)

0.3472222222222222


In [37]:
tree.to_graphviz()

digraph tree {
	"3 0.6 0" [label="3 0.6 0", shape=circle]
	"3 1.7 2" [label="3 1.7 2", shape=circle]
	"leaf 0 1" [label="leaf 0 1", shape=circle]
	"0 5.6 20" [label="0 5.6 20", shape=circle]
	"2 5.1 3" [label="2 5.1 3", shape=circle]
	"0 5.7 22" [label="0 5.7 22", shape=circle]
	"leaf 2 21" [label="leaf 2 21", shape=circle]
	"0 6.9 4" [label="0 6.9 4", shape=circle]
	"leaf 2 19" [label="leaf 2 19", shape=circle]
	"0 5.8 24" [label="0 5.8 24", shape=circle]
	"leaf 2 23" [label="leaf 2 23", shape=circle]
	"0 6.7 5" [label="0 6.7 5", shape=circle]
	"leaf 1 18" [label="leaf 1 18", shape=circle]
	"0 7.6 26" [label="0 7.6 26", shape=circle]
	"leaf 2 25" [label="leaf 2 25", shape=circle]
	"0 6.6 6" [label="0 6.6 6", shape=circle]
	"leaf 1 17" [label="leaf 1 17", shape=circle]
	"leaf 2 27" [label="leaf 2 27", shape=circle]
	"leaf 2 28" [label="leaf 2 28", shape=circle]
	"0 6.5 7" [label="0 6.5 7", shape=circle]
	"leaf 1 16" [label="leaf 1 16", shape=circle]
	"0 6.4 8" [label="0 6.4 8", shape=c

In [62]:

print(df.to_numpy())
cols = df.columns
cols_d = {}
id=0
for col in cols:
    cols_d[col] = id
    id+=1
Y_pred = dt.predict(np.array(X_test.to_numpy()), cols_d)
Y_pred

[[  0.  18.   4. ...   0.   0.   0.]
 [  1.  15.   1. ...   0.   0.   0.]
 [  2.  34.   1. ...   0.   0.   0.]
 ...
 [855.  25.   2. ...   0.   1.   0.]
 [856.  33.   2. ...   0.   0.   0.]
 [857.  29.   2. ...   0.   0.   0.]]
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Sc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [63]:
accuracy = (Y_pred == Y_test).sum() / Y_test.shape[0]
print(accuracy)

0.9302325581395349


In [65]:
161/172

0.936046511627907