In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from treelib import Tree
import operator
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from scipy.stats import t as t_dist
from scipy.stats import norm, chi2
import pickle

In [17]:
class Node:
    def __init__(self, name, feature=None, threshold=None, left_child=None, right_child=None, is_leaf=False, value=-1, depth=-1):
        self.feature = feature
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child
        self.is_leaf = is_leaf
        self.value = value
        self.name = name
        self.depth = depth

In [60]:
class MyDecisionTree:
    def __init__(self, min_samples=1, max_depth=-1, max_thresholds=None, min_unique_values=None, num_random_columns=None):
        self.root_node = Node('root', depth=0)
        self.node_count = 0
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.max_thresholds = max_thresholds
        self.min_unique_values = min_unique_values
        self.num_random_columns = num_random_columns
    
    def predict(self, X, cols_d):
        
        Y_pred = []
        for i in range(len(X)):

            x_i = X[i]
            # print(x_i[0])
            curr_node = self.root_node
            predicted_y = np.array(self.predict_util(x_i, curr_node, cols_d))
            # print(predicted_y)
            Y_pred.append(predicted_y)

        return np.array(Y_pred)

    def predict_util(self, x, curr_node, cols_d):
        
        # print(curr_node.feature)
        if(curr_node.is_leaf):
            return curr_node.value
        if x[cols_d[curr_node.feature]] <= curr_node.threshold:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.left_child, cols_d)
        else:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.right_child, cols_d)

    def fit(self, X, Y):
        self.fit_util(X, Y, self.root_node)


    def fit_util(self, X, Y, current_node):
        if np.unique(Y).shape[0] == 1:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        scores = []
        num_random_columns = len(X.columns)
        if self.num_random_columns is not None:
            num_random_columns = self.num_random_columns
        random_columns = X.columns.to_numpy()[np.random.permutation(len(X.columns))[:num_random_columns]]
        print(random_columns)
        for column in random_columns:
            X_train = X[column].to_numpy()
            X_train = X_train.reshape(X_train.shape[0], 1)
            clf = LogisticRegression(random_state=0, max_iter=100)
            clf.fit(X_train, Y)
            score = clf.score(X_train, Y)
            scores.append(score)
        scores = np.array(scores)
        best_feature = X.columns[np.argmax(scores)]
        best_feature_values = np.unique(X[best_feature])
        best_feature_values.sort()

        partition = self.get_partition(X, Y, best_feature, best_feature_values, current_node.depth)
        
        if partition == None:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        (X_left, Y_left), (X_right, Y_right), threshold = partition
        current_node.threshold = threshold
        current_node.feature = best_feature
        current_node.name = f'{best_feature} {threshold} {self.node_count}'
        self.node_count += 1
        current_node.left_child = Node('No name', depth=current_node.depth + 1)
        current_node.right_child = Node('No name', depth=current_node.depth + 1)

        self.fit_util(X_left, Y_left, current_node.left_child)
        self.fit_util(X_right, Y_right, current_node.right_child)

        

    def do_split(self, X, thresh):
        """
            Split the data at a node based on threshold
        """

        left_child_ids = np.where(X <= thresh, True, False)
        right_child_ids = np.where(X > thresh, True, False)
        return left_child_ids, right_child_ids
    
    def find_entropy(self, Y):
        probs = []
        possible_classes, counts = np.unique(Y, return_counts=True)
        sort_indices = np.argsort(possible_classes)
        possible_classes = possible_classes[sort_indices]
        counts = counts[sort_indices]
        
        for class_label, count in zip(possible_classes, counts):
            probs.append(count/Y.shape[0])
        
        entropy = 0
        for prob in probs:
            entropy -= prob*np.log2(prob)
        
        return entropy

    def get_partition(self, X, Y, feature, thresholds, current_node_depth):
        '''
            This function should return left and right
            partitions according to appropritate
            partitioning algorithm. Return None if all
            data has same label
        '''
        
        # if only 1 class available at a node OR MIN_SAMPLES left at a node - Leaf Node reached
        if(len(Y) < self.min_samples):
            return None
        if(current_node_depth == self.max_depth):
            return None
        
        best_info_gain = -float('inf')
        best_thresh = thresholds[0]      
        threshold_cnt = len(thresholds)  
        if self.max_thresholds is not None:
            threshold_cnt = self.max_thresholds
        min_unique_values = len(thresholds)

        if self.min_unique_values is not None:
            min_unique_values = self.min_unique_values
        
        print("Now thresholding!")
        for thresh in (thresholds[:-1] if len(thresholds) <= min_unique_values else thresholds[:-1:len(thresholds) // threshold_cnt]):

            left_child_ids, right_child_ids = self.do_split(X[feature].to_numpy(), thresh)

            parent_pts = X.shape[0]
            left_child_pts = len(left_child_ids)
            right_child_pts = len(right_child_ids)

            info_gain = self.find_entropy(Y) - (left_child_pts / parent_pts) * self.find_entropy(Y[left_child_ids]) - (right_child_pts / parent_pts) * self.find_entropy(Y[right_child_ids])

            if(info_gain > best_info_gain):

                best_info_gain = info_gain
                best_thresh = thresh
        
        # partition according to best threshold
        best_left_ids, best_right_ids = self.do_split(X[feature].to_numpy(), best_thresh)
        # print(len(Y[best_left_ids]), len(Y[best_right_ids]))
        if(len(Y[best_left_ids]) == 0 or len(Y[best_right_ids]) == 0):
            # print(len(Y[best_left_ids]), len(Y[best_right_ids]))
            return None
        return (X[best_left_ids], Y[best_left_ids]), (X[best_right_ids], Y[best_right_ids]), best_thresh
    
    def print_tree(self):
        tree = Tree()
        self.print_tree_util(self.root_node, tree)
        tree.show()
        return tree

    def print_tree_util(self, root, tree, parent=None):
        if parent is not None:
            tree.create_node(root.name, root.name, parent=parent.name)
        else:
            tree.create_node(root.name, root.name)
        if root.is_leaf:
            return
        self.print_tree_util(root.left_child, tree, root)
        self.print_tree_util(root.right_child, tree, root)


In [6]:
class Node2:
    def __init__(self, name, feature=None, threshold=None, left_child1=None, left_child2=None, right_child1=None, right_child2=None, is_leaf=False, value=-1, depth=-1):
        self.feature = feature
        self.threshold = threshold
        self.left_child1 = left_child1
        self.left_child2 = left_child2
        self.right_child1 = right_child1
        self.right_child2 = right_child2
        self.is_leaf = is_leaf
        self.value = value
        self.name = name
        self.depth = depth
    

In [30]:
class MyDecisionTree2:
    def __init__(self, min_samples=1, max_depth=-1, max_thresholds=None, min_unique_values=None, num_random_columns=None):
        self.root_node = Node2('root', depth=0)
        self.node_count = 0
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.max_thresholds = max_thresholds
        self.min_unique_values = min_unique_values
        self.num_random_columns = num_random_columns
    
    def predict(self, X, cols_d):
        
        Y_pred = []
        for i in range(len(X)):

            x_i = X[i]
            # print(x_i[0])
            curr_node = self.root_node
            predicted_y = np.array(self.predict_util(x_i, curr_node, cols_d))
            # print(predicted_y)
            Y_pred.append(predicted_y)

        return np.array(Y_pred)

    def predict_util(self, x, curr_node, cols_d):
        
        if(curr_node.is_leaf):
            return curr_node.value
        if x[cols_d[curr_node.feature[0]]] <= curr_node.threshold[0] and x[cols_d[curr_node.feature[1]]] <= curr_node.threshold[1]:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.left_child1, cols_d)

        if x[cols_d[curr_node.feature[0]]] > curr_node.threshold[0] and x[cols_d[curr_node.feature[1]]] <= curr_node.threshold[1]:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.left_child2, cols_d)

        if x[cols_d[curr_node.feature[0]]] <= curr_node.threshold[0] and x[cols_d[curr_node.feature[1]]] > curr_node.threshold[1]:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.right_child1, cols_d)

        elif x[cols_d[curr_node.feature[0]]] > curr_node.threshold[0] and x[cols_d[curr_node.feature[1]]] > curr_node.threshold[1]:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.right_child2, cols_d)

    def fit(self, X, Y):
        self.fit_util(X, Y, self.root_node)


    def fit_util(self, X, Y, current_node):
        if np.unique(Y).shape[0] == 1:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        scores = {}
        num_random_columns = len(X.columns)
        if self.num_random_columns is not None:
            num_random_columns = self.num_random_columns
        random_columns = X.columns.to_numpy()[np.random.permutation(len(X.columns))[:num_random_columns]]
        
        for column1 in random_columns:
            for column2 in random_columns:
                column = [column1, column2]
                X_train = X[column].to_numpy()
                X_train = X_train.reshape(X_train.shape[0], 2)
                clf = LogisticRegression(random_state=0)
                clf.fit(X_train, Y)
                score = clf.score(X_train, Y)
                scores[(column1, column2)] = score
            

        scores_sorted = dict( sorted(scores.items(), key=operator.itemgetter(1),reverse=True))
        best_feature_set = list(scores_sorted.keys())[0]
        best_feature1_values = np.unique(X[best_feature_set[0]])
        best_feature2_values = np.unique(X[best_feature_set[1]])

        best_feature1_values.sort()
        best_feature2_values.sort()

        partition = self.get_partition(X, Y, best_feature_set[0], best_feature_set[1], best_feature1_values, best_feature2_values, current_node.depth)
        
        if partition == None:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        (X_left1, Y_left1), (X_left2, Y_left2), (X_right1, Y_right1), (X_right2, Y_right2), threshold1, threshold2 = partition

        threshold = [threshold1, threshold2]
        # print("Thresholds: ",threshold)
        current_node.threshold = threshold
        current_node.feature = best_feature_set
        current_node.name = f'{best_feature_set} {threshold} {self.node_count}'
        self.node_count += 1
        current_node.left_child1 = Node2('unnamed', depth=current_node.depth + 1)
        current_node.left_child2 = Node2('unnamed', depth=current_node.depth + 1)
        current_node.right_child1 = Node2('unnamed', depth=current_node.depth + 1)
        current_node.right_child2 = Node2('unnamed', depth=current_node.depth + 1)

        unq, counts = np.unique(Y, return_counts=True)
        max_freq_idx = np.argmax(counts).flatten()
        if(X_left1.shape[0] == 0):
            current_node.left_child1.is_leaf = True
            current_node.left_child1.value = unq[max_freq_idx].squeeze()
            current_node.left_child1.name = f'leaf {current_node.left_child1.value} {self.node_count}'
            self.node_count+=1 
            # return
        
        if(X_left2.shape[0] == 0):
            current_node.left_child2.is_leaf = True
            current_node.left_child2.value = unq[max_freq_idx].squeeze()
            current_node.left_child2.name = f'leaf {current_node.left_child2.value} {self.node_count}'
            self.node_count+=1 
            # return
        
        if(X_right1.shape[0] == 0):
            current_node.right_child1.is_leaf = True
            current_node.right_child1.value = unq[max_freq_idx].squeeze()
            current_node.right_child1.name = f'leaf {current_node.right_child1.value} {self.node_count}'
            self.node_count+=1   
            # return     
        
        if(X_right2.shape[0] == 0):
            current_node.right_child2.is_leaf = True
            current_node.right_child2.value = unq[max_freq_idx].squeeze()
            current_node.right_child2.name = f'leaf {current_node.right_child2.value} {self.node_count}'
            self.node_count+=1  
            # return                  
          
        if(X_left1.shape[0] != 0):
            self.fit_util(X_left1, Y_left1, current_node.left_child1)
        
        if(X_left2.shape[0] != 0):
            self.fit_util(X_left2, Y_left2, current_node.left_child2)
        
        if(X_right1.shape[0] != 0):
            self.fit_util(X_right1, Y_right1, current_node.right_child1)
        
        if(X_right2.shape[0] != 0):
            self.fit_util(X_right2, Y_right2, current_node.right_child2)


    def do_split(self, X, thresh):
        """
            Split the data at a node based on threshold
        """

        left_child_ids = np.where(X <= thresh, True, False)
        right_child_ids = np.where(X > thresh, True, False)
        return left_child_ids, right_child_ids

    def do_split_final(self, X1, X2, thresh1, thresh2):

        """
            Split according to the best thresholds for the 2 features
        """   

        left1_ids = np.where(np.logical_and(X1 <= thresh1, X2 <= thresh2), True, False)
        left2_ids = np.where(np.logical_and(X1 > thresh1, X2 <= thresh2), True, False)
        right1_ids = np.where(np.logical_and(X1 <= thresh1, X2 > thresh2), True, False)
        right2_ids = np.where(np.logical_and(X1 > thresh1, X2 > thresh2), True, False)

        return left1_ids, left2_ids, right1_ids, right2_ids

    def find_entropy(self, Y):
        probs = []
        possible_classes, counts = np.unique(Y, return_counts=True)
        sort_indices = np.argsort(possible_classes)
        possible_classes = possible_classes[sort_indices]
        counts = counts[sort_indices]
        
        for class_label, count in zip(possible_classes, counts):
            probs.append(count/Y.shape[0])
        
        entropy = 0
        for prob in probs:
            entropy -= prob*np.log2(prob)
        
        return entropy
    
    def find_best_thresh_info_gain(self, thresholds1, thresholds2, X, Y, feature1, feature2):
        """
            This function finds the best threshold and info gain for a given feature
        """
        best_info_gain = -float('inf')
        best_thresh1 = thresholds1[0]   
        best_thresh2 = thresholds2[0]     
        threshold_cnt_1 = len(thresholds1)
        threshold_cnt_2  = len(thresholds2)
        if self.max_thresholds is not None:
            threshold_cnt_1, threshold_cnt_2 = self.max_thresholds, self.max_thresholds
        min_unique_values_1 = len(thresholds1)
        min_unique_values_2 = len(thresholds2)

        if self.min_unique_values is not None:
            min_unique_values_1, min_unique_values_2 = self.min_unique_values, self.min_unique_values
        
        print("Now thresholding!")
        for thresh1 in (thresholds1[:-1] if len(thresholds1) <= min_unique_values_1 else thresholds1[:-1:len(thresholds1) // threshold_cnt_1]):
            for thresh2 in (thresholds2[:-1] if len(thresholds2) <= min_unique_values_2 else thresholds2[:-1:len(thresholds2) // threshold_cnt_2]):
                left_child1_ids, left_child2_ids,  right_child1_ids, right_child2_ids = self.do_split_final(X[feature1].to_numpy(),X[feature2].to_numpy(), thresh1, thresh2)

                parent_pts = X.shape[0]
                left_child1_pts = len(left_child1_ids)
                left_child2_pts = len(left_child2_ids)
                right_child1_pts = len(right_child1_ids)
                right_child2_pts = len(right_child2_ids)

                info_gain = self.find_entropy(Y) - (left_child1_pts / parent_pts) * self.find_entropy(Y[left_child1_ids]) - (left_child2_pts / parent_pts) * self.find_entropy(Y[left_child2_ids]) - (right_child1_pts / parent_pts) * self.find_entropy(Y[right_child1_ids]) - (right_child2_pts / parent_pts) * self.find_entropy(Y[right_child2_ids])
                
                if(info_gain > best_info_gain):

                    best_info_gain = info_gain
                    best_thresh1 = thresh1
                    best_thresh2 = thresh2

        
        return best_thresh1, best_thresh2, best_info_gain

    def get_partition(self, X, Y, feature1, feature2, thresholds1, thresholds2, current_node_depth):
        '''
            This function should return left and right
            partitions according to appropritate
            partitioning algorithm. Return None if all
            data has same label
        '''
        
        # if only 1 class available at a node OR MIN_SAMPLES left at a node - Leaf Node reached
        if(len(Y) < self.min_samples):
            return None
        if(current_node_depth == self.max_depth):
            return None
        
        best_thresh1, best_thresh2, best_info_gain = self.find_best_thresh_info_gain(thresholds1, thresholds2, X, Y, feature1, feature2)

        # partition according to best threshold
        best_left_ids1, best_left_ids2, best_right_ids1, best_right_ids2 = self.do_split_final(X[feature1].to_numpy(), X[feature2].to_numpy(), best_thresh1, best_thresh2)

        # print(len(Y[best_left_ids1]), len(Y[best_left_ids2]), len(Y[best_right_ids1]) , len(Y[best_right_ids2]))

        count_0 = 0
        if(len(Y[best_left_ids1]) == 0):
            count_0 += 1
        if(len(Y[best_left_ids2]) == 0):
            count_0 += 1
        if(len(Y[best_right_ids1]) == 0):
            count_0 += 1
        if(len(Y[best_right_ids2]) == 0):
            count_0 += 1

        if(count_0 == 3):
            print(len(Y[best_left_ids1]), len(Y[best_left_ids2]), len(Y[best_right_ids1]) , len(Y[best_right_ids2]))
            return None

        return (X[best_left_ids1], Y[best_left_ids1]), (X[best_left_ids2], Y[best_left_ids2]), (X[best_right_ids1], Y[best_right_ids1]),(X[best_right_ids2], Y[best_right_ids2]), best_thresh1, best_thresh2    
    
    def print_tree(self):
        tree = Tree()
        self.print_tree_util(self.root_node, tree)
        tree.show()
        return tree

    def print_tree_util(self, root, tree, parent=None):
        if parent is not None:
            print(root.name)
            tree.create_node(root.name, root.name, parent=parent.name)
        else:
            print(root.name)
            tree.create_node(root.name, root.name)
        if root.is_leaf:
            return
        self.print_tree_util(root.left_child1, tree, root)
        self.print_tree_util(root.left_child2, tree, root)
        self.print_tree_util(root.right_child1, tree, root)
        self.print_tree_util(root.right_child2, tree, root)


In [101]:
# def paired_t_test(p):
#     p_hat = np.mean(p)
#     n = len(p)
#     den = np.sqrt(sum([(diff - p_hat)**2 for diff in p]) / (n - 1))
#     t = (p_hat * (n**(1/2))) / den
    
#     p_value = t_dist.sf(t, n-1)*2
    
#     return t, p_value

# def z_test(acc1, acc2, n):

#     z, p = proportion_difference(acc1, acc2, n_1 = n)
#     return z,p

# def mcnamer_test(Y_test, Y_pred1, Y_pred2):
#     b = sum(np.logical_and((Y_pred2 != Y_test), (Y_pred1 == Y_test)))
#     c = sum(np.logical_and((Y_pred1 != Y_test), (Y_pred2 == Y_test)))

#     chi_2 = ((np.abs(b-c)-1)**2)/(b+c)
#     p = chi2.sf(chi_2, 1)

#     return chi_2, p

In [39]:
df_cancer = pd.read_csv('./data/final_data1.csv')

In [56]:
X_cancer = df_cancer.drop(['Unnamed: 0', 'Biopsy'], axis=1)
Y_cancer = df_cancer['Biopsy'].to_numpy()
X_train_cancer, X_test_cancer, Y_train_cancer, Y_test_cancer = train_test_split(X_cancer, Y_cancer, test_size=0.2, random_state = 42, stratify=Y_cancer)
print(X_train_cancer.shape)
print(np.unique(Y_train_cancer, return_counts=True))
print(np.unique(Y_test_cancer, return_counts=True))

(686, 30)
(array([0, 1], dtype=int64), array([642,  44], dtype=int64))
(array([0, 1], dtype=int64), array([161,  11], dtype=int64))


In [57]:
cols_cancer = X_cancer.columns
cols_d_cancer = {}
id=0
for col in cols_cancer:
    cols_d_cancer[col] = id
    id+=1

In [62]:
dt_cancer1 = MyDecisionTree(max_depth=5, num_random_columns=1)
dt_cancer1.fit(X_train_cancer, Y_train_cancer)
Y_pred1_cancer = dt_cancer1.predict(np.array(X_test_cancer.to_numpy()), cols_d_cancer)
accuracy1 = accuracy_score(Y_pred1_cancer, Y_test_cancer)
precision1 = precision_score(Y_pred1_cancer, Y_test_cancer)
recall1 = recall_score(Y_pred1_cancer, Y_test_cancer)
f1 = f1_score(Y_pred1_cancer, Y_test_cancer)
print("One-feature split accuracy: ", accuracy1)
print("One-feature split Precision: ", precision1)
print("One-feature split Recall: ", recall1)
print("One-feature split F1-Score: ", f1)

['STDs:Hepatitis B']
Now thresholding!
['Age']
Now thresholding!
['STDs:pelvic inflammatory disease']
Now thresholding!
['Num of pregnancies']
Now thresholding!
['STDs: Number of diagnosis']
Now thresholding!
['STDs:pelvic inflammatory disease']
One-feature split accuracy:  0.936046511627907
One-feature split Precision:  0.0
One-feature split Recall:  0.0
One-feature split F1-Score:  0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
tree = dt_cancer1.print_tree()

Age 70 0
├── leaf 0 1
└── leaf 0 2



In [26]:
pickle.dump(dt_cancer1, open("Results/Data1/1_Way/Models_Saved/Model.pkl", 'wb'))
metrics_array = np.array([accuracy1, precision1, recall1, f1])

with open('./Results/Data1/1_Way/Metrics/metrics.npy', 'wb') as f:
    np.save(f, metrics_array)

with open('./Results/Data1/1_Way/Y_Pred/Pred.npy', 'wb') as f:
    np.save(f, Y_pred1_cancer)


In [27]:
pickled_model = pickle.load(open("Results/Data1/1_Way/Models_Saved/Model.pkl", 'rb'))
Y_ = pickled_model.predict(np.array(X_test_cancer.to_numpy()), cols_d_cancer)
tree = pickled_model.print_tree()

Age 70 0
├── Age 52 1
│   ├── Age 51 2
│   │   ├── Age 50 3
│   │   │   ├── Age 49 4
│   │   │   │   ├── leaf 0 5
│   │   │   │   └── leaf 0 6
│   │   │   └── leaf 1 7
│   │   └── leaf 0 8
│   └── leaf 0 9
└── leaf 0 10



In [28]:
acc_scores1 = []
precision_scores1 = []
recall_scores1 = []
f1_scores1 = []
strtfdKFold = StratifiedKFold(n_splits=5)
for train_index, test_index in strtfdKFold.split(X_cancer, Y_cancer):
    X_train, X_test = X_cancer.iloc[train_index, :], X_cancer.iloc[test_index, :]
    Y_train, Y_test = Y_cancer[train_index], Y_cancer[test_index]

    dt1_cancer = MyDecisionTree(min_samples=1,  )
    dt1_cancer.fit(X_train, Y_train)
    Y_pred1_cancer = dt1_cancer.predict(np.array(X_test.to_numpy()), cols_d_cancer)

    accuracy1 = accuracy_score(Y_pred1_cancer, Y_test)
    precision1 = precision_score(Y_pred1_cancer, Y_test)
    recall1 = recall_score(Y_pred1_cancer, Y_test)
    f1 = f1_score(Y_pred1_cancer, Y_test)

    acc_scores1.append(accuracy1)
    precision1.append(precision1)
    recall1.append(recall1)
    f1_scores1.append(f1)

    print("----------")
    print("Accuracy1: ", accuracy1)
    print("Precision1: ", precision1)
    print("Recall1: ", recall1)
    print("F1-score: ", f1)
    print("----------")


with open("Results/Data1/1_Way/K_Folds/Accuracy/acc.npy",'wb') as f:
    np.save(f, np.array(acc_scores1))

with open("Results/Data1/1_Way/K_Folds/Precision/precision.npy", 'wb') as f:
    np.save(f, np.array(precision1))

with open("Results/Data1/1_Way/K_Folds/Recall/recall.npy", 'wb') as f:
    np.save(f, np.array(recall1))

with open("Results/Data1/1_Way/K_Folds/F1_Score/f1.npy", 'wb') as f:
    np.save(f, np.array(f1_scores1))



Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!


KeyboardInterrupt: 

In [15]:
dt_cancer2 = MyDecisionTree2(min_samples=1, num_random_columns=30)
dt_cancer2.fit(X_train_cancer, Y_train_cancer)

Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
596 0 0 0
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
28 0 0 0
Now thresholding!
Now thresholding!
Now thresholding!
10 0 0 0
Now thresholding!
Now thresholding!
Now thresholding!
Now thresholding!
12 0 0 0


In [16]:
Y_pred2_cancer = dt_cancer2.predict(np.array(X_test_cancer.to_numpy()), cols_d_cancer)
accuracy2 = accuracy_score(Y_pred2_cancer, Y_test_cancer)
print("two-features split accuracy: ", accuracy2)
f1_score(Y_pred2_cancer, Y_test_cancer)

two-features split accuracy:  0.9302325581395349


0.25000000000000006

In [None]:
t, p_value = paired_t_test(acc_scores_diff)
print(t, p_value)

2.8200629894731226 0.04782506579017487
