In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

In [None]:
class Node:
    """One threshold test in a binary decision tree.

    Attributes:
        feature: key used to index the sample (``x[feature]``).
        threshold: value the selected feature is compared against.
        left_child: branch taken when ``x[feature] < threshold``.
        right_child: branch taken otherwise.
        is_leaf: when true, the chosen child is returned as-is instead of
            being descended into.
    """

    def __init__(self, feature, threshold, left_child=None, right_child=None, is_leaf=False):
        self.feature, self.threshold = feature, threshold
        self.left_child, self.right_child = left_child, right_child
        self.is_leaf = is_leaf

    def forward(self, x):
        """Route sample ``x`` through this node and return the result.

        For a leaf the selected child itself is the result; otherwise the
        selected child's ``forward`` is called with the same sample.
        """
        # NOTE(review): the comparison is strict here, whereas
        # MyDecisionTree.do_split sends values equal to the threshold to the
        # left (`<=`) -- confirm which convention is intended.
        goes_left = x[self.feature] < self.threshold
        branch = self.left_child if goes_left else self.right_child
        return branch if self.is_leaf else branch.forward(x)

In [79]:
from pyparsing import null_debug_action


class MyDecisionTree:
    """Work-in-progress decision tree with entropy-based threshold selection.

    ``fit`` currently only ranks the features (no tree is built yet); the
    other methods are the building blocks for choosing a split threshold.
    """

    def __init__(self):
        pass

    def fit(self, X, Y):
        """Rank every column of ``X`` by how well it alone predicts ``Y``.

        A single-feature ``LogisticRegression`` is fitted per column and
        scored on the same data it was trained on. The label of the best
        column and the array of per-column scores are printed, and are also
        kept on the instance as ``best_feature_`` and ``feature_scores_``.

        Args:
            X: pandas DataFrame of features (``X.columns`` / ``X[column]``).
            Y: labels, one per row of ``X``.
        """
        scores = []
        for column in X.columns:
            X_train = X[column].to_numpy()
            # The classifier is fitted on a 2-D (n_samples, 1) array.
            X_train = X_train.reshape(X_train.shape[0], 1)
            clf = LogisticRegression(random_state=0)
            clf.fit(X_train, Y)
            scores.append(clf.score(X_train, Y))
        scores = np.array(scores)
        best_feature = X.columns[np.argmax(scores)]
        self.best_feature_ = best_feature
        self.feature_scores_ = scores
        print(best_feature)
        print(scores)

    def do_split(self, X, thresh):
        """Split the data at a node based on a threshold.

        Args:
            X: 1-D array-like of feature values.
            thresh: split value.

        Returns:
            ``(left_child_ids, right_child_ids)``: integer position arrays of
            the entries with ``X <= thresh`` and ``X > thresh`` respectively.
        """
        values = np.asarray(X)
        # np.where(cond) returns a tuple of arrays (which has no .flatten());
        # flatnonzero yields the flat index array directly.
        left_child_ids = np.flatnonzero(values <= thresh)
        right_child_ids = np.flatnonzero(values > thresh)
        return left_child_ids, right_child_ids

    def find_entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        Probabilities are the relative frequencies of each distinct label.
        An empty or single-class ``y`` has entropy ``0.0``.
        """
        _, counts = np.unique(np.asarray(y), return_counts=True)
        if counts.size <= 1:
            return 0.0
        probs = counts / counts.sum()
        # Every count is >= 1, so log2 never sees a zero probability.
        return float(-np.sum(probs * np.log2(probs)))

    @staticmethod
    def _feature_values(X, feature):
        """Return the 1-D values to threshold: ``X`` itself if 1-D, else column ``feature``."""
        if isinstance(X, pd.DataFrame):
            return X[feature].to_numpy()
        values = np.asarray(X)
        return values if values.ndim == 1 else values[:, feature]

    @staticmethod
    def _take_rows(X, ids):
        """Select the rows of ``X`` at integer positions ``ids``."""
        return X.iloc[ids] if hasattr(X, "iloc") else np.asarray(X)[ids]

    def get_partition(self, X, Y, feature, threshold):
        """Partition the data at the candidate threshold with the best information gain.

        Args:
            X: feature data. A 1-D array is thresholded directly; for a 2-D
                array or a DataFrame, column ``feature`` is thresholded and
                whole rows are partitioned.
            Y: labels aligned with the rows of ``X``.
            feature: column selector, used only when ``X`` is not 1-D.
            threshold: non-empty sequence of candidate split values.

        Returns:
            ``None`` if ``Y`` is empty or all labels are identical; otherwise
            ``((X_left, Y_left), (X_right, Y_right), best_thresh)`` where the
            left side holds the rows with value ``<= best_thresh``.
        """
        Y = np.asarray(Y)
        if len(np.unique(Y)) <= 1:
            return None

        values = self._feature_values(X, feature)
        parent_pts = len(values)
        parent_entropy = self.find_entropy(Y)

        best_info_gain = -np.inf
        best_thresh = threshold[0]
        for thresh in threshold:
            left_child_ids, right_child_ids = self.do_split(values, thresh)
            # Children entropies are weighted by the share of points they hold.
            children_entropy = (
                len(left_child_ids) / parent_pts * self.find_entropy(Y[left_child_ids])
                + len(right_child_ids) / parent_pts * self.find_entropy(Y[right_child_ids])
            )
            info_gain = parent_entropy - children_entropy

            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_thresh = thresh

        # Partition according to the best threshold.
        best_left_ids, best_right_ids = self.do_split(values, best_thresh)
        return (
            (self._take_rows(X, best_left_ids), Y[best_left_ids]),
            (self._take_rows(X, best_right_ids), Y[best_right_ids]),
            best_thresh,
        )

In [None]:
df = pd.read_csv('data/preprocessed_cancer.csv')

In [None]:
print(Y.shape)

In [None]:
dt = MyDecisionTree()
X = df.loc[:, df.columns != 'Biopsy']
Y = df['Biopsy'].to_numpy()
dt.fit(X, Y)

In [72]:
# Run the same feature ranking on the iris dataset; the feature matrix is
# wrapped in a DataFrame because fit() iterates over X.columns.
iris = load_iris()
Y_iris = iris.target
X_iris = pd.DataFrame(iris.data)
dt_iris = MyDecisionTree()
dt_iris.fit(X_iris, Y_iris)

3
[0.74666667 0.55333333 0.95333333 0.96      ]
