### IMPORT THE PACKAGES NUMPY, PANDAS, SCIKIT-LEARN

The scikit-learn decision tree is used as a baseline to compare results against, and scikit-learn's accuracy score is the metric used to score both models. The OrdinalEncoder converts the nominal text values into numeric codes so that they can be fed to the decision trees.

In [63]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as Scikit_DTC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder

### IMPORT THE IRIS, WINE, AND BREAST-CANCER DATASETS

In [64]:
# iris dataset
# NOTE(review): the ".tmls" files are parsed as plain CSV. The first row after
# the header is treated as a per-column data-type marker; the tree code below
# compares these markers against 'r' (real) and 'n' (nominal) — confirm against
# the actual file format.
dataset1 = pd.read_csv("iris.tmls")
# store data type: first row, every column except the last one
# (the last column is later used as the target)
datatype1 = dataset1.iloc[:1, :-1]
# remove data type row so only samples remain
dataset1 = dataset1.iloc[1:, :]
# wine dataset (same layout handling as iris)
dataset2 = pd.read_csv("wine.tmls")
# store data type: first row, every column except the last one
datatype2 = dataset2.iloc[:1, :-1]
# remove data type row so only samples remain
dataset2 = dataset2.iloc[1:, :]
# breast-cancer dataset: here column 0 is used as the target and the
# remaining columns as features
dataset3 = pd.read_csv("breast-cancer.tmls")
# save data type: first row, every column except the first one
datatype3 = dataset3.iloc[:1, 1:]
# split dataset for ordinal encoding of the features
# (index is reset so the target lines up with the re-built feature frame)
target3 = dataset3.iloc[1:, 0].reset_index(drop=True)
X3 = dataset3.iloc[1:, 1:]
oc = OrdinalEncoder()
# the encoded X (numeric codes, returned as a numpy array)
encodedDataset = oc.fit_transform(X3)
# convert X from numpy array to pandas dataframe (columns become 0..8)
X3 = pd.DataFrame(encodedDataset)
# give original feature name back to columns
X3 = X3.rename({ 0: 'age', 1: 'menopause', 2: 'tumor-size', 3: 'inv-nodes', 4: 'node-caps', 5: 'deg-malig', 6: 'breast', 7: 'breast-quad', 8: 'irradiate'}, axis=1)
# combine X and y; the target ends up as the LAST column, matching the
# layout of the other two datasets
dataset3 = pd.concat([X3, target3], axis=1)
# number of cross-validation folds used by every experiment below
cv = 10

### CROSS-VALIDATOR TO SPLIT THE DATASETS INTO EQUAL PARTS

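Each dataset is evaluated with 10-fold cross-validation: the shuffled dataset is sliced into 10 consecutive parts of equal size, and each part serves once as the test set.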

In [65]:
def cross_validator(number: int, size: int, dataset):
    """Slice ``dataset`` into ``number`` consecutive row blocks.

    Part ``k`` holds the rows at positions ``k * size`` up to (but not
    including) ``(k + 1) * size``. Rows located after ``number * size`` do
    not appear in any part, and a slice that runs past the end of the data
    is simply shorter (or empty).

    Args:
        number: how many parts to produce.
        size: number of rows per part.
        dataset: a pandas object supporting ``.iloc[rows, cols]`` slicing.

    Returns:
        A list with ``number`` slices of ``dataset``, in row order.
    """
    return [
        dataset.iloc[block * size:(block + 1) * size, :]
        for block in range(number)
    ]

### THE IMPURITY CALCULATION FUNCTIONS: CLASS PROBABILITIES, ENTROPY, AND GINI INDEX

In [66]:

def class_probabilities(target_labels) -> list[float]:
    """Return the relative frequency of every distinct label.

    Args:
        target_labels: a sequence of class labels (list, numpy array or
            pandas Series).

    Returns:
        One probability per distinct label, ordered like the sorted unique
        labels returned by ``np.unique``. An empty input yields ``[]``.
    """
    total_instances = len(target_labels)
    # nothing to count: avoid a division by zero
    if total_instances == 0:
        return []
    # np.unique counts every label in a single pass instead of re-scanning
    # the whole label sequence once per class
    _, counts = np.unique(target_labels, return_counts=True)
    # tolist() turns the numpy values back into plain Python floats
    return (counts / total_instances).tolist()

def calc_entropy(class_probs: list[float]):
    """Return the entropy (base 2) of a class-probability distribution.

    Probabilities that are not strictly positive are skipped, so a zero
    probability contributes nothing instead of producing ``log2(0)``.
    """
    positive_probs = (prob for prob in class_probs if prob > 0)
    return sum(-prob * np.log2(prob) for prob in positive_probs)

def calc_gini_index(class_probs: list[float]):
    """Return the Gini impurity of a class-probability distribution.

    The Gini impurity is ``sum(p * (1 - p))`` over all classes, which equals
    ``1 - sum(p ** 2)`` when the probabilities add up to one. The shortcut
    ``2 * p * (1 - p)`` is only valid as a single term for a two-class
    problem; applying it to every class doubles the result.

    Args:
        class_probs: probability of each class.

    Returns:
        The impurity: 0 for a pure node (and for an empty list), growing as
        the classes become more evenly mixed.
    """
    return sum(p * (1 - p) for p in class_probs if p > 0)

### NODE CLASS USED WITHIN DECISION TREE

In [67]:
class Node:
    """A single node of the decision tree: either a branch or a leaf.

    A node counts as a leaf when ``value`` is anything other than ``None``.
    Branch nodes carry the split description (``feature``, ``threshold``,
    ``dType``) together with their ``left`` and ``right`` children.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, dType=None, *, value=None):
        # split description: which feature is tested against which threshold
        self.feature, self.threshold = feature, threshold
        # children reached from this node
        self.left, self.right = left, right
        # data type of the split feature (nominal or real)
        self.dType = dType
        # prediction stored on a leaf; stays None for branches
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a prediction value."""
        return not (self.value is None)

    def print_node(self):
        """Print the content of the node (debugging helper)."""
        if self.is_leaf():
            print("Leaf Value: ", self.value)
            return
        print("Branch: ")
        branch_fields = (
            ("Feature: ", self.feature),
            ("Threshold: ", self.threshold),
            ("Left: ", self.left),
            ("Right: ", self.right),
            ("Data Type: ", self.dType),
        )
        for caption, content in branch_fields:
            print(caption, content)

### DECISION TREE CLASSIFIER CLASS

In [68]:
class DecisionTreeClassifier:
    """Binary decision tree classifier for real and nominal features.

    Every feature carries a data-type marker: ``'r'`` (real valued, split at
    the column mean with ``<=`` going left) or ``'n'`` (nominal, split on
    equality with one of the observed values, equal going left). The markers
    are supplied to ``fit`` as a one-row frame whose columns are positionally
    aligned with the columns of ``X``.
    """

    def __init__(self, purity_function = 'gini', max_depth=50):
        # 'gini' selects the Gini index; any other value selects entropy
        self.purity_function = purity_function
        # depth limit of the tree, used to bound overfitting and recursion
        self.max_depth = max_depth
        # the root node of the tree; None until fit() has been called
        self.root = None

    def impurity(self, y):
        """Return the impurity of the labels ``y`` under the configured function."""
        class_probs = class_probabilities(y)
        if self.purity_function == 'gini':
            return calc_gini_index(class_probs)
        # every other setting falls back to entropy
        return calc_entropy(class_probs)

    def purity_gain(self, X_column, y, split_threshold, datatype):
        """Return the impurity reduction obtained by splitting on one column.

        Args:
            X_column: the feature column (pandas Series).
            y: the labels (pandas Series, positionally aligned with X_column).
            split_threshold: threshold handed to ``split``.
            datatype: 'r' or 'n' marker of the column.

        Returns:
            parent impurity minus the size-weighted child impurity, or 0 when
            the split leaves one side empty (nothing was separated).
        """
        left, right = self.split(X_column, split_threshold, datatype)
        # a split with an empty side separates nothing, so there is no gain;
        # checked first so the parent impurity is not computed needlessly
        if len(left) == 0 or len(right) == 0:
            return 0
        d_parent = self.impurity(y)
        d = len(y)
        l, r = len(left), len(right)
        d_l, d_r = self.impurity(y.iloc[left]), self.impurity(y.iloc[right])
        # children impurity weighted by the share of samples in each child
        d_child = (l / d) * d_l + (r / d) * d_r
        return d_parent - d_child

    def homogeneous(self, y)-> bool:
        """Tell whether ``y`` is pure enough to become a leaf.

        True when only one class is present, when the majority class covers
        more than 95% of the samples, or when fewer than 5 samples remain and
        the majority class covers at least half of them (this stops tiny,
        possibly tied nodes from being split further).
        """
        unique_classes, class_count = self.class_counts(y)
        if len(unique_classes) == 1:
            return True
        n_samples = y.shape[0]
        # share of the most frequent class (0.0 when y is empty)
        majority_value = max(class_count) / n_samples if class_count else 0.0
        if majority_value > 0.95:
            return True
        if n_samples < 5 and majority_value >= .50:
            return True
        return False

    def label(self, y):
        """Return the most frequent class in ``y``.

        Ties are resolved in favour of the class that comes first in the
        sorted unique classes. An empty ``y`` yields ``''``.
        """
        unique_classes, class_counts = self.class_counts(y)
        if len(unique_classes) == 0:
            return ''
        # argmax returns the first maximum, which gives the tie rule above
        return unique_classes[int(np.argmax(class_counts))]

    def traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for one sample and return its value.

        Args:
            x: a one-row DataFrame (as produced by ``X.iloc[[i]]``).
            node: the node to start from.
        """
        if node.is_leaf():
            return node.value
        # float() accepts numpy scalars as well as plain Python numbers
        threshold = float(node.threshold)
        if node.dType == 'n':
            # nominal: equal to the threshold goes left, everything else right
            if x[node.feature].values == threshold:
                return self.traverse_tree(x, node.left)
            return self.traverse_tree(x, node.right)
        # real valued: less than or equal to the threshold goes left
        elif node.dType == 'r' and x[node.feature].values <= threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

    def best_split(self, X, y, n_features, datatype):
        """Find the feature/threshold pair with the highest purity gain.

        Real features are tried with a single threshold (the column mean);
        nominal features are tried once per distinct value. Features whose
        marker is neither 'r' nor 'n' are skipped.

        Returns:
            ``(best_feature, best_threshold, data_type)`` where ``data_type``
            is the marker of the winning feature. When no candidate was
            evaluated the result is ``('', 0.0, '')``.
        """
        features = X.columns
        best_gain = -1
        best_threshold = 0.0
        best_feature = ''
        # marker of the WINNING feature; remembered together with the gain so
        # that a later feature of another type cannot overwrite it
        best_dtype = ''
        for feature in range(n_features):
            dtype = datatype.iloc[:, feature].values
            X_column = X.iloc[:, feature]
            if dtype == 'r':
                candidates = [X_column.mean(axis=0)]
            elif dtype == 'n':
                candidates = np.unique(X_column)
            else:
                continue
            for threshold in candidates:
                gain = self.purity_gain(X_column, y, threshold, dtype)
                if gain > best_gain:
                    best_gain = gain
                    best_threshold = threshold
                    best_feature = features[feature]
                    best_dtype = dtype
        return best_feature, best_threshold, best_dtype

    def split(self, X, threshold, datatype):
        """Return the positional indices that go left and right for a split.

        Args:
            X: the feature column (pandas Series).
            threshold: split value.
            datatype: 'r' (left: ``<= threshold``, right: ``> threshold``) or
                'n' (left: ``== threshold``, right: ``!= threshold``).

        Raises:
            ValueError: if ``datatype`` is neither 'r' nor 'n'.
        """
        if datatype == 'r':
            left_indices = np.argwhere(X.values <= threshold).flatten()
            right_indices = np.argwhere(X.values > threshold).flatten()
        elif datatype == 'n':
            left_indices = np.argwhere(X.values == threshold).flatten()
            right_indices = np.argwhere(X.values != threshold).flatten()
        else:
            raise ValueError(
                f"unknown feature data type {datatype!r}; expected 'r' or 'n'"
            )
        return left_indices, right_indices

    def class_counts(self, y):
        """Return the sorted unique classes of ``y`` and a list of their counts."""
        unique_classes, counts = np.unique(y, return_counts=True)
        return unique_classes, counts.tolist()

    def fit(self, X, y, datatype):
        """Build the tree from features ``X``, labels ``y`` and type markers."""
        self.root = self.grow_tree(X, y, datatype)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        # each sample is passed as a one-row frame, which is what
        # traverse_tree expects
        return [
            self.traverse_tree(X.iloc[[index]], self.root)
            for index in range(len(X))
        ]

    def grow_tree(self, X, y, datatype, depth=0):
        """Recursively grow the (sub)tree for the given samples.

        Returns:
            A leaf ``Node`` holding the majority label when the labels are
            homogeneous, the depth limit is exceeded, or no split separates
            the samples; otherwise a branch ``Node`` with two children.
        """
        # positional indices from split() must match the row positions
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        n_features = X.shape[1]
        if self.homogeneous(y) or depth > self.max_depth:
            return Node(value=self.label(y))
        best_feature, best_threshold, d_type = self.best_split(X, y, n_features, datatype)
        # no usable feature at all (no columns, or no 'r'/'n' markers)
        if best_feature not in X.columns:
            return Node(value=self.label(y))
        left, right = self.split(X[best_feature], best_threshold, d_type)
        # if even the best split leaves one side empty, the samples cannot be
        # separated: finish with a leaf instead of recursing on identical
        # data and storing an index array where a child node is expected
        if len(left) == 0 or len(right) == 0:
            return Node(value=self.label(y))
        left_child = self.grow_tree(X.iloc[left, :], y.iloc[left], datatype, depth + 1)
        right_child = self.grow_tree(X.iloc[right, :], y.iloc[right], datatype, depth + 1)
        return Node(best_feature, best_threshold, left_child, right_child, d_type)


### GET SIZES FOR TRAIN AND TEST FOR EACH OF THE DATASETS

In [69]:
# iris dataset
# rows and columns of the dataset (the column count includes the target)
samples1, features1 = dataset1.shape
# size of one fold; integer division, so up to cv - 1 rows are left over
# and never land in any of the parts built by cross_validator
test_size1 = samples1 // cv
train_size1 = samples1 - test_size1
# Wine dataset (same computation)
samples2, features2 = dataset2.shape
test_size2 = samples2 // cv
train_size2 = samples2 - test_size2
# breast-cancer dataset (same computation)
samples3, features3 = dataset3.shape
test_size3 = samples3 // cv
train_size3 = samples3 - test_size3

### IRIS DATASET 10 X 10-FOLD CROSS VALIDATION

In [70]:
#Iris dataset
print("Iris Dataset")
# mean 10-fold accuracy of every repetition, one list per model
sklearn_test_scores = []
my_dt_test_scores = []
# 10 independent repetitions of a 10-fold cross validation
for times in range(10):
    # per-fold accuracies of the current repetition
    sklearn_accuracy = []
    my_accuracy = []
    # reshuffle the rows so every repetition uses different folds
    # (one shuffle is enough; a second one adds nothing)
    dataset1 = dataset1.sample(frac=1)
    # split into cv parts of test_size1 rows each
    parts1 = cross_validator(cv, test_size1, dataset1)
    for fold in range(cv):
        # fresh, untrained models for every fold
        scikitDT = Scikit_DTC()
        dt = DecisionTreeClassifier()
        # the current part is the test set, all other parts form the train set
        test = parts1[fold]
        train_list = [part for index, part in enumerate(parts1) if index != fold]
        train = pd.concat(train_list)
        # last column is the target, everything before it are the features;
        # features are cast to float so that numpy can operate on them
        X_test = test.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_test = test.iloc[:, -1].reset_index(drop=True)
        X_train = train.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_train = train.iloc[:, -1].reset_index(drop=True)
        # scikit-learn baseline: fit, predict and score
        scikitDT.fit(X_train, y_train)
        sky_pred = scikitDT.predict(X_test)
        # accuracy_score expects (y_true, y_pred)
        sklearn_acc = accuracy_score(y_test, sky_pred)
        print("sklearn fold ", fold, " accuracy: ", sklearn_acc)
        sklearn_accuracy.append(sklearn_acc)
        # own decision tree: fit (with the feature data types), predict, score
        dt.fit(X_train, y_train, datatype1)
        y_pred = dt.predict(X_test)
        my_acc = accuracy_score(y_test, y_pred)
        my_accuracy.append(my_acc)
        print("My DT Fold", fold, " accuracy:", my_acc)
    # average over the folds of this repetition and store it
    sklearn_avg_accuracy = np.array(sklearn_accuracy).mean(axis=0)
    sklearn_test_scores.append(sklearn_avg_accuracy)
    my_avg_accuracy = np.array(my_accuracy).mean(axis=0)
    my_dt_test_scores.append(my_avg_accuracy)

for i, score in enumerate(sklearn_test_scores):
    print("sklearn Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

for i, score in enumerate(my_dt_test_scores):
    print("My Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

Iris Dataset
sklearn fold  0  accuracy:  1.0
My DT Fold 0  accuracy: 1.0
sklearn fold  1  accuracy:  0.8666666666666667
My DT Fold 1  accuracy: 0.8666666666666667
sklearn fold  2  accuracy:  0.9333333333333333
My DT Fold 2  accuracy: 0.9333333333333333
sklearn fold  3  accuracy:  1.0
My DT Fold 3  accuracy: 1.0
sklearn fold  4  accuracy:  1.0
My DT Fold 4  accuracy: 1.0
sklearn fold  5  accuracy:  0.9333333333333333
My DT Fold 5  accuracy: 0.9333333333333333
sklearn fold  6  accuracy:  1.0
My DT Fold 6  accuracy: 1.0
sklearn fold  7  accuracy:  0.9333333333333333
My DT Fold 7  accuracy: 0.9333333333333333
sklearn fold  8  accuracy:  0.9333333333333333
My DT Fold 8  accuracy: 0.9333333333333333
sklearn fold  9  accuracy:  0.9333333333333333
My DT Fold 9  accuracy: 0.8
sklearn fold  0  accuracy:  0.9333333333333333
My DT Fold 0  accuracy: 1.0
sklearn fold  1  accuracy:  0.9333333333333333
My DT Fold 1  accuracy: 0.9333333333333333
sklearn fold  2  accuracy:  0.9333333333333333
My DT Fold

### WINE DATASET 10 X 10-FOLD CROSS VALIDATION

In [71]:
#Wine dataset
print("Wine Dataset")
# mean 10-fold accuracy of every repetition, one list per model
sklearn_test_scores = []
my_dt_test_scores = []
# 10 independent repetitions of a 10-fold cross validation
for times in range(10):
    # per-fold accuracies of the current repetition
    sklearn_accuracy = []
    my_accuracy = []
    # reshuffle the rows so every repetition uses different folds
    # (one shuffle is enough; a second one adds nothing)
    dataset2 = dataset2.sample(frac=1)
    # split into cv parts of test_size2 rows each
    parts2 = cross_validator(cv, test_size2, dataset2)
    for fold in range(cv):
        # fresh, untrained models for every fold
        scikitDT = Scikit_DTC()
        dt = DecisionTreeClassifier()
        # the current part is the test set, all other parts form the train set
        test = parts2[fold]
        train_list = [part for index, part in enumerate(parts2) if index != fold]
        train = pd.concat(train_list)
        # last column is the target, everything before it are the features;
        # features are cast to float so that numpy can operate on them
        X_test = test.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_test = test.iloc[:, -1].reset_index(drop=True)
        X_train = train.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_train = train.iloc[:, -1].reset_index(drop=True)
        # scikit-learn baseline: fit, predict and score
        scikitDT.fit(X_train, y_train)
        sky_pred = scikitDT.predict(X_test)
        # accuracy_score expects (y_true, y_pred)
        sklearn_acc = accuracy_score(y_test, sky_pred)
        print("sklearn fold ", fold, " accuracy: ", sklearn_acc)
        sklearn_accuracy.append(sklearn_acc)
        # own decision tree: fit (with the feature data types), predict, score
        dt.fit(X_train, y_train, datatype2)
        y_pred = dt.predict(X_test)
        my_acc = accuracy_score(y_test, y_pred)
        my_accuracy.append(my_acc)
        print("My DT Fold", fold, " accuracy:", my_acc)
    # average over the folds of this repetition and store it
    sklearn_avg_accuracy = np.array(sklearn_accuracy).mean(axis=0)
    sklearn_test_scores.append(sklearn_avg_accuracy)
    my_avg_accuracy = np.array(my_accuracy).mean(axis=0)
    my_dt_test_scores.append(my_avg_accuracy)

for i, score in enumerate(sklearn_test_scores):
    print("sklearn Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

for i, score in enumerate(my_dt_test_scores):
    print("My Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

Wine Dataset
sklearn fold  0  accuracy:  0.8823529411764706
My DT Fold 0  accuracy: 0.9411764705882353
sklearn fold  1  accuracy:  0.8235294117647058
My DT Fold 1  accuracy: 0.8823529411764706
sklearn fold  2  accuracy:  1.0
My DT Fold 2  accuracy: 0.8235294117647058
sklearn fold  3  accuracy:  1.0
My DT Fold 3  accuracy: 1.0
sklearn fold  4  accuracy:  0.9411764705882353
My DT Fold 4  accuracy: 0.7647058823529411
sklearn fold  5  accuracy:  0.9411764705882353
My DT Fold 5  accuracy: 0.7058823529411765
sklearn fold  6  accuracy:  1.0
My DT Fold 6  accuracy: 0.9411764705882353
sklearn fold  7  accuracy:  0.8235294117647058
My DT Fold 7  accuracy: 0.9411764705882353
sklearn fold  8  accuracy:  0.8235294117647058
My DT Fold 8  accuracy: 0.9411764705882353
sklearn fold  9  accuracy:  1.0
My DT Fold 9  accuracy: 1.0
sklearn fold  0  accuracy:  0.8823529411764706
My DT Fold 0  accuracy: 1.0
sklearn fold  1  accuracy:  1.0
My DT Fold 1  accuracy: 1.0
sklearn fold  2  accuracy:  0.823529411764

### BREAST-CANCER DATASET 10 X 10-FOLD CROSS VALIDATION

In [72]:
#Breast-Cancer dataset
print("Breast Cancer Dataset")
# mean 10-fold accuracy of every repetition, one list per model
sklearn_test_scores = []
my_dt_test_scores = []
# 10 independent repetitions of a 10-fold cross validation
for times in range(10):
    # per-fold accuracies of the current repetition
    sklearn_accuracy = []
    my_accuracy = []
    # reshuffle the rows so every repetition uses different folds
    # (one shuffle is enough; a second one adds nothing)
    dataset3 = dataset3.sample(frac=1)
    # split into cv parts of test_size3 rows each
    parts3 = cross_validator(cv, test_size3, dataset3)
    for fold in range(cv):
        # fresh, untrained models for every fold
        scikitDT = Scikit_DTC()
        dt = DecisionTreeClassifier()
        # the current part is the test set, all other parts form the train set
        test = parts3[fold]
        train_list = [part for index, part in enumerate(parts3) if index != fold]
        train = pd.concat(train_list)
        # last column is the target, everything before it are the features;
        # features are cast to float so that numpy can operate on them
        X_test = test.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_test = test.iloc[:, -1].reset_index(drop=True)
        X_train = train.iloc[:, :-1].astype(float).reset_index(drop=True)
        y_train = train.iloc[:, -1].reset_index(drop=True)
        # scikit-learn baseline: fit, predict and score
        scikitDT.fit(X_train, y_train)
        sky_pred = scikitDT.predict(X_test)
        # accuracy_score expects (y_true, y_pred)
        sklearn_acc = accuracy_score(y_test, sky_pred)
        print("sklearn fold ", fold, " accuracy: ", sklearn_acc)
        sklearn_accuracy.append(sklearn_acc)
        # own decision tree: fit (with the feature data types), predict, score
        dt.fit(X_train, y_train, datatype3)
        y_pred = dt.predict(X_test)
        my_acc = accuracy_score(y_test, y_pred)
        my_accuracy.append(my_acc)
        print("My DT Fold", fold, " accuracy:", my_acc)
    # average over the folds of this repetition and store it
    sklearn_avg_accuracy = np.array(sklearn_accuracy).mean(axis=0)
    sklearn_test_scores.append(sklearn_avg_accuracy)
    my_avg_accuracy = np.array(my_accuracy).mean(axis=0)
    my_dt_test_scores.append(my_avg_accuracy)

for i, score in enumerate(sklearn_test_scores):
    print("sklearn Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

for i, score in enumerate(my_dt_test_scores):
    print("My Decision Tree accuracy score test ", i, ": ", score)
print("*" * 100)

Breast Cancer Dataset
sklearn fold  0  accuracy:  0.6071428571428571
My DT Fold 0  accuracy: 0.5
sklearn fold  1  accuracy:  0.6785714285714286
My DT Fold 1  accuracy: 0.7857142857142857
sklearn fold  2  accuracy:  0.6785714285714286
My DT Fold 2  accuracy: 0.6428571428571429
sklearn fold  3  accuracy:  0.6071428571428571
My DT Fold 3  accuracy: 0.7857142857142857
sklearn fold  4  accuracy:  0.6071428571428571
My DT Fold 4  accuracy: 0.6428571428571429
sklearn fold  5  accuracy:  0.6071428571428571
My DT Fold 5  accuracy: 0.6785714285714286
sklearn fold  6  accuracy:  0.6428571428571429
My DT Fold 6  accuracy: 0.7857142857142857
sklearn fold  7  accuracy:  0.6071428571428571
My DT Fold 7  accuracy: 0.6428571428571429
sklearn fold  8  accuracy:  0.6428571428571429
My DT Fold 8  accuracy: 0.75
sklearn fold  9  accuracy:  0.7857142857142857
My DT Fold 9  accuracy: 0.7857142857142857
sklearn fold  0  accuracy:  0.75
My DT Fold 0  accuracy: 0.7142857142857143
sklearn fold  1  accuracy:  0.5