In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from treelib import Tree


In [3]:
class Node:
    def __init__(self, name, feature=None, threshold=None, left_child=None, right_child=None, is_leaf=False, value=-1, depth=-1):
        self.feature = feature
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child
        self.is_leaf = is_leaf
        self.value = value
        self.name = name
        self.depth = depth

In [4]:
class MyDecisionTree:
    def __init__(self, min_samples=1, max_depth=-1):
        self.root_node = Node('root', depth=0)
        self.node_count = 0
        self.min_samples = min_samples
        self.max_depth = max_depth
    
    def predict(self, X, cols_d):
        
        Y_pred = []
        for i in range(len(X)):

            x_i = X[i]
            # print(x_i[0])
            curr_node = self.root_node
            predicted_y = np.array(self.predict_util(x_i, curr_node, cols_d))
            # print(predicted_y)
            Y_pred.append(predicted_y)

        return np.array(Y_pred)

    def predict_util(self, x, curr_node, cols_d):
        
        print(curr_node.feature)
        if(curr_node.is_leaf):
            return curr_node.value
        if x[cols_d[curr_node.feature]] <= curr_node.threshold:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.left_child, cols_d)
        else:
            if curr_node.is_leaf:
                return curr_node.value
            return self.predict_util(x, curr_node.right_child, cols_d)

    def fit(self, X, Y):
        self.fit_util(X, Y, self.root_node)


    def fit_util(self, X, Y, current_node):
        if np.unique(Y).shape[0] == 1:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        scores = []
        for column in X.columns:
            X_train = X[column].to_numpy()
            X_train = X_train.reshape(X_train.shape[0], 1)
            clf = LogisticRegression(random_state=0, max_iter=100)
            clf.fit(X_train, Y)
            score = clf.score(X_train, Y)
            scores.append(score)
        scores = np.array(scores)
        best_feature = X.columns[np.argmax(scores)]
        best_feature_values = np.unique(X[best_feature])
        best_feature_values.sort()

        partition = self.get_partition(X, Y, best_feature, best_feature_values, current_node.depth)
        
        if partition == None:
            current_node.is_leaf = True
            unq, counts = np.unique(Y, return_counts=True)
            max_freq_idx = np.argmax(counts).flatten()
            current_node.value = unq[max_freq_idx].squeeze()
            current_node.name = f'leaf {current_node.value} {self.node_count}'
            self.node_count += 1
            return
            
        (X_left, Y_left), (X_right, Y_right), threshold = partition
        current_node.threshold = threshold
        current_node.feature = best_feature
        current_node.name = f'{best_feature} {threshold} {self.node_count}'
        self.node_count += 1
        current_node.left_child = Node('No name', depth=current_node.depth + 1)
        current_node.right_child = Node('No name', depth=current_node.depth + 1)

        self.fit_util(X_left, Y_left, current_node.left_child)
        self.fit_util(X_right, Y_right, current_node.right_child)

        

    def do_split(self, X, thresh):
        """
            Split the data at a node based on threshold
        """

        left_child_ids = np.where(X <= thresh, True, False)
        right_child_ids = np.where(X > thresh, True, False)
        return left_child_ids, right_child_ids
    
    def find_entropy(self, Y):
        probs = []
        possible_classes, counts = np.unique(Y, return_counts=True)
        sort_indices = np.argsort(possible_classes)
        possible_classes = possible_classes[sort_indices]
        counts = counts[sort_indices]
        
        for class_label, count in zip(possible_classes, counts):
            probs.append(count/Y.shape[0])
        
        entropy = 0
        for prob in probs:
            entropy -= prob*np.log2(prob)
        
        return entropy

    def get_partition(self, X, Y, feature, thresholds, current_node_depth):
        '''
            This function should return left and right
            partitions according to appropritate
            partitioning algorithm. Return None if all
            data has same label
        '''
        
        # if only 1 class available at a node OR MIN_SAMPLES left at a node - Leaf Node reached
        if(len(Y) < self.min_samples):
            return None
        if(current_node_depth == self.max_depth):
            return None
        
        best_info_gain = -float('inf')
        best_thresh = thresholds[0]        
        for thresh in thresholds[:-1]:

            left_child_ids, right_child_ids = self.do_split(X[feature].to_numpy(), thresh)

            parent_pts = X.shape[0]
            left_child_pts = len(left_child_ids)
            right_child_pts = len(right_child_ids)

            info_gain = self.find_entropy(Y) - (left_child_pts / parent_pts) * self.find_entropy(Y[left_child_ids]) - (right_child_pts / parent_pts) * self.find_entropy(Y[right_child_ids])

            if(info_gain > best_info_gain):

                best_info_gain = info_gain
                best_thresh = thresh
        
        # partition according to best threshold
        best_left_ids, best_right_ids = self.do_split(X[feature].to_numpy(), best_thresh)
        print(len(Y[best_left_ids]), len(Y[best_right_ids]))
        if(len(Y[best_left_ids]) == 0 or len(Y[best_right_ids]) == 0):
            print(len(Y[best_left_ids]), len(Y[best_right_ids]))
            return None
        return (X[best_left_ids], Y[best_left_ids]), (X[best_right_ids], Y[best_right_ids]), best_thresh
    
    def print_tree(self):
        tree = Tree()
        self.print_tree_util(self.root_node, tree)
        tree.show()
        return tree

    def print_tree_util(self, root, tree, parent=None):
        if parent is not None:
            tree.create_node(root.name, root.name, parent=parent.name)
        else:
            tree.create_node(root.name, root.name)
        if root.is_leaf:
            return
        self.print_tree_util(root.left_child, tree, root)
        self.print_tree_util(root.right_child, tree, root)


In [13]:
df_cancer = pd.read_csv('pre-processed_cancer.csv')

In [14]:
df_cancer.columns

Index(['Unnamed: 0', 'Age', 'Number of sexual partners',
       'First sexual intercourse', 'Num of pregnancies', 'Smokes',
       'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives',
       'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs',
       'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis',
       'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
       'STDs:syphilis', 'STDs:pelvic inflammatory disease',
       'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS',
       'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')

In [20]:
X_cancer = df_cancer.drop(['Unnamed: 0', 'Biopsy'], axis=1)
Y_cancer = df_cancer['Biopsy'].to_numpy()
X_train_cancer, X_test_cancer, Y_train_cancer, Y_test_cancer = train_test_split(X_cancer, Y_cancer, test_size=0.2, random_state=42)
print(X_train_cancer.shape)

(686, 35)


In [37]:
from matplotlib import test


dt_cancer = MyDecisionTree(min_samples = 1, max_depth = 3)
dt_cancer.fit(X_train_cancer, Y_train_cancer)

629 57


In [38]:
unq, counts = np.unique(Y_test_cancer, return_counts=True)
print(counts)

[161  11]


In [39]:
tree = dt_cancer.print_tree()

Schiller 0 0
├── leaf 0 1
└── leaf 1 2



In [40]:
cols = df_cancer.columns
cols_d = {}
id=0
for col in cols:
    cols_d[col] = id
    id+=1
Y_pred_cancer = dt_cancer.predict(np.array(X_test_cancer.to_numpy()), cols_d)
Y_pred_cancer

Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schill

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [41]:
accuracy = (Y_pred_cancer == Y_test_cancer).sum() / Y_test_cancer.shape[0]
print(accuracy)

0.9302325581395349


BANKING DATA

In [30]:
df_banking = pd.read_csv("../data/pre-processed_banking.csv")

In [31]:
df_banking.columns

Index(['Unnamed: 0', 'age', 'job', 'marital', 'education', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')

In [32]:
X_banking = df_banking.drop(['Unnamed: 0', 'y'], axis=1)
Y_banking = df_banking['y'].to_numpy()
X_train_banking, X_test_banking, Y_train_banking, Y_test_banking = train_test_split(X_banking, Y_banking, test_size=0.2, random_state=42)
print(X_train_banking.shape)

(32950, 19)


In [33]:
dt = MyDecisionTree(min_samples = 686)
dt.fit(X_train_banking, Y_train_banking)

4480 28470
1053 3427
22 1031
1030 1
2 1028
1002 26
1001 1
982 19
950 32
838 112
837 1
1 836
835 1
834 1
833 1
832 1
831 1
830 1
829 1
828 1
827 1
826 1
825 1
824 1
823 1
822 1
821 1
820 1
819 1
818 1
817 1
816 1
815 1
814 1
813 1
812 1
811 1
810 1
809 1
808 1
807 1
806 1
805 1
20 785
2 783
1 782
703 79
702 1
701 1
700 1
699 1
698 1
697 1
696 1
695 1
693 2
692 1
691 1
690 1
689 1
688 1
687 1
685 2
1 3426
3425 1
3423 2
3422 1
3419 3
3418 1
3417 1
3416 1
3413 3
3412 1
3411 1
3410 1
3409 1
3407 2
3406 1
3405 1
3404 1
3401 3
3400 1
3399 1
3398 1
3397 1
3396 1
3395 1
3393 2
3392 1
3385 7
3384 1
3383 1
3382 1
3377 5
3376 1
3375 1
3374 1
3372 2
3371 1
3368 3
3367 1
3366 1
3365 1
3364 1
3363 1
3362 1
3361 1
3360 1
3355 5
3354 1
3352 2
5 3347
6 3341
6 3335
3 3332
7 3325
5 3320
6 3314
3 3311
6 3305
3 3302
4 3298
6 3292
6 3286
8 3278
3 3275
2 3273
4 3269
10 3259
7 3252
5 3247
4 3243
5 3238
3 3235
1 3234
4 3230
1 3229
4 3225
2 3223
3 3220
3 3217
3 3214
6 3208
4 3204
7 3197
5 3192
5 3187
7 3180
5 31

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stab

886 9
10 876
16 860
14 846


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


824 22
800 24


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


777 23
765 12


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


743 22
725 18
710 15
697 13
674 23
66 25876
77 25799
91 25708
85 25623
80 25543
86 25457
102 25355
73 25282
84 25198
73 25125
551 24574
722 23852
721 1
720 1
719 1
718 1
2 716
1 715
713 2
712 1
3 709
705 4
11 694
8 686
11 675
214 23638
411 23227
692 22535
691 1
690 1
689 1
688 1
687 1
686 1
685 1
626 21909
203 21706
597 21109
105 21004
200 20804
824 19980
823 1
822 1
821 1
820 1
819 1
2 817
815 2
3 812
3 809
2 807
801 6
790 11
12 778
17 761
15 746
462 284
1020 18960
1 1019
1 1018
1017 1
1016 1
1015 1
1014 1
1013 1
2 1011
2 1009
1005 4
1004 1
1003 1
1002 1
998 4
986 12
980 6
970 10
940 30
687 253
542 145
112 18848
619 18229
438 17791
121 17670
1493 16177
1 1492
5 1487
5 1482
804 678
716 88
671 45
180 15997
1724 14273
1722 2
1719 3
1718 1
1716 2
1077 639
1018 59
896 122
568 328
661 13612
551 13061
480 12581
470 12111
123 11988
204 11784
710 11074
424 286
115 10959
861 10098
495 366
442 9656
614 9042
83 8959
859 8100
1 858
857 1
856 1
854 2
2 852
655 197
220 7880
249 7631
259 7372
135 723

In [34]:
tree = dt.print_tree()

poutcome 2 0
├── duration 0 416
│   ├── duration 1 418
│   │   ├── duration 2 420
│   │   │   ├── duration 4199 422
│   │   │   │   ├── duration 3785 423
│   │   │   │   │   ├── duration 3643 424
│   │   │   │   │   │   ├── duration 3422 425
│   │   │   │   │   │   │   ├── duration 3366 426
│   │   │   │   │   │   │   │   ├── duration 3253 427
│   │   │   │   │   │   │   │   │   ├── duration 3183 428
│   │   │   │   │   │   │   │   │   │   ├── duration 2635 429
│   │   │   │   │   │   │   │   │   │   │   ├── duration 2621 430
│   │   │   │   │   │   │   │   │   │   │   │   ├── duration 2316 431
│   │   │   │   │   │   │   │   │   │   │   │   │   ├── duration 2301 432
│   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── duration 2260 433
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── duration 2231 434
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── duration 2191 435
│   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   │   ├── durati

In [35]:
tree.to_graphviz()

digraph tree {
	"poutcome 2 0" [label="poutcome 2 0", shape=circle]
	"duration 0 416" [label="duration 0 416", shape=circle]
	"poutcome 1 1" [label="poutcome 1 1", shape=circle]
	"duration 1 418" [label="duration 1 418", shape=circle]
	"leaf 0 417" [label="leaf 0 417", shape=circle]
	"duration 1 125" [label="duration 1 125", shape=circle]
	"duration 62 2" [label="duration 62 2", shape=circle]
	"duration 2 420" [label="duration 2 420", shape=circle]
	"leaf 0 419" [label="leaf 0 419", shape=circle]
	"duration 2184 127" [label="duration 2184 127", shape=circle]
	"leaf 0 126" [label="leaf 0 126", shape=circle]
	"euribor3m 4.191 4" [label="euribor3m 4.191 4", shape=circle]
	"leaf 0 3" [label="leaf 0 3", shape=circle]
	"duration 4199 422" [label="duration 4199 422", shape=circle]
	"leaf 0 421" [label="leaf 0 421", shape=circle]
	"duration 1745 128" [label="duration 1745 128", shape=circle]
	"leaf 0 415" [label="leaf 0 415", shape=circle]
	"euribor3m 0.634 5" [label="euribor3m 0.634 5", shape

In [42]:
cols = df_banking.columns
cols_d = {}
id=0
for col in cols:
    cols_d[col] = id
    id+=1
Y_pred_banking = dt.predict(np.array(X_test_banking.to_numpy()), cols_d)
Y_pred_banking

poutcome
poutcome
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
None
poutcome
poutcome
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
duration
None
poutcome
poutcome
duration
None
poutcome
poutcome
duration
duration
dura

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [43]:
accuracy = (Y_pred_banking == Y_test_banking).sum() / Y_test_banking.shape[0]
print(accuracy)

0.8855304685603301


In [32]:
digits = load_digits()
X_digits, Y_digits = digits.data, digits.target
X_digits = X_digits/255
X_digits_train,X_digits_test,Y_digits_train,Y_digits_test=train_test_split(X_digits,Y_digits,test_size=0.2,random_state=42)
X_digits_train = pd.DataFrame(X_digits_train)
X_digits_test = pd.DataFrame(X_digits_test)

print(X_digits_train.shape)
print(X_digits_test.shape)

# X_iris = pd.DataFrame(X_iris)
# print(X_iris.shape)

(1437, 64)
(360, 64)


In [33]:
dt_digits = MyDecisionTree(min_samples=1)
dt_digits.fit(X_digits_train, Y_digits_train)
tree = dt_digits.print_tree()

372 1065
372 0
372 0
249 816
249 0
249 0
276 540
276 0
276 0
540 0
540 0
34 0.0 0
├── 53 0.0 2
│   ├── 21 0.0 4
│   │   ├── leaf 0 6
│   │   └── leaf 6 5
│   └── leaf 7 3
└── leaf 3 1



In [34]:
pred_y_digits = dt_digits.predict(X_digits_test.to_numpy()).squeeze()
# print(pred_y_digits)

accuracy = (pred_y_digits == Y_digits_test).sum() / Y_digits_test.shape[0]
print(accuracy)

0.3472222222222222


In [37]:
tree.to_graphviz()

digraph tree {
	"3 0.6 0" [label="3 0.6 0", shape=circle]
	"3 1.7 2" [label="3 1.7 2", shape=circle]
	"leaf 0 1" [label="leaf 0 1", shape=circle]
	"0 5.6 20" [label="0 5.6 20", shape=circle]
	"2 5.1 3" [label="2 5.1 3", shape=circle]
	"0 5.7 22" [label="0 5.7 22", shape=circle]
	"leaf 2 21" [label="leaf 2 21", shape=circle]
	"0 6.9 4" [label="0 6.9 4", shape=circle]
	"leaf 2 19" [label="leaf 2 19", shape=circle]
	"0 5.8 24" [label="0 5.8 24", shape=circle]
	"leaf 2 23" [label="leaf 2 23", shape=circle]
	"0 6.7 5" [label="0 6.7 5", shape=circle]
	"leaf 1 18" [label="leaf 1 18", shape=circle]
	"0 7.6 26" [label="0 7.6 26", shape=circle]
	"leaf 2 25" [label="leaf 2 25", shape=circle]
	"0 6.6 6" [label="0 6.6 6", shape=circle]
	"leaf 1 17" [label="leaf 1 17", shape=circle]
	"leaf 2 27" [label="leaf 2 27", shape=circle]
	"leaf 2 28" [label="leaf 2 28", shape=circle]
	"0 6.5 7" [label="0 6.5 7", shape=circle]
	"leaf 1 16" [label="leaf 1 16", shape=circle]
	"0 6.4 8" [label="0 6.4 8", shape=c

In [62]:

print(df.to_numpy())
cols = df.columns
cols_d = {}
id=0
for col in cols:
    cols_d[col] = id
    id+=1
Y_pred = dt.predict(np.array(X_test.to_numpy()), cols_d)
Y_pred

[[  0.  18.   4. ...   0.   0.   0.]
 [  1.  15.   1. ...   0.   0.   0.]
 [  2.  34.   1. ...   0.   0.   0.]
 ...
 [855.  25.   2. ...   0.   1.   0.]
 [856.  33.   2. ...   0.   0.   0.]
 [857.  29.   2. ...   0.   0.   0.]]
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Schiller
None
Sc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [63]:
accuracy = (Y_pred == Y_test).sum() / Y_test.shape[0]
print(accuracy)

0.9302325581395349


In [65]:
161/172

0.936046511627907