In [None]:
#using sklearn module
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../input/apndcts/apndcts.csv")

# The first seven columns are used as features, the eighth as the target.
X = df.iloc[:, 0:7]
y = df.iloc[:, 7]

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test statistics leak into training.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

model = DecisionTreeClassifier(criterion='entropy', random_state=68, max_depth=3, min_samples_leaf=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Predicted array is:")
print(y_pred)
# scikit-learn metrics take (y_true, y_pred) in that order.  The builtin
# round() is used instead of the .round() method so the line works whether
# the metric comes back as a numpy scalar or as a plain Python float.
print("Accuracy score of our model is:", round(accuracy_score(y_test, y_pred) * 100, 3), '%')
print("F1 score of our model is:", round(f1_score(y_test, y_pred) * 100, 3), '%')

In [None]:
#doing from scratch
import pandas as pd
import numpy as np
import math as m
from sklearn.model_selection import train_test_split

#load the CSV into a pandas DataFrame; the code below treats column 7 as the label
df = pd.read_csv("../input/apndcts/apndcts.csv")

#function to split a dataset in two parts
def group_split(index, value, dataset):
    """Partition the rows of ``dataset`` on a single column threshold.

    Rows whose ``row[index]`` is less than or equal to ``value`` go to the
    first list, all remaining rows go to the second list.  Row order is
    preserved within each list.

    Returns:
        A ``(left, right)`` pair of lists.
    """
    partitions = ([], [])
    for row in dataset:
        # slot 0 collects the "<= value" rows, slot 1 everything else
        side = 0 if row[index] <= value else 1
        partitions[side].append(row)
    return partitions

#finding information gain by entropy over a split of dataset
def _binary_entropy(p):
    """Base-2 entropy of a two-class distribution where class 1 has probability ``p``.

    A pure distribution (``p`` of 0 or 1) has zero entropy; handling it here
    avoids evaluating ``log2(0)``.
    """
    if p <= 0 or p >= 1:
        return 0.0
    return -(p * m.log2(p) + (1 - p) * m.log2(1 - p))


def information_gain(groups):
    """Information gain (entropy reduction) achieved by splitting into ``groups``.

    Args:
        groups: a sequence of row groups (the caller passes ``(left, right)``).
            The last element of every row is its class label; rows whose
            label equals 1 count as positive, everything else as negative.

    Returns:
        Entropy of all rows together minus the size-weighted entropy of the
        groups, rounded to 5 decimals.  Returns 0.0 when there are no rows
        at all.
    """
    sizes = [len(group) for group in groups]
    total = sum(sizes)
    if total == 0:
        # nothing to split, so nothing can be gained
        return 0.0

    # positives per group, always read from the last column
    positives = [sum(1 for row in group if row[-1] == 1) for group in groups]

    weighted_entropy = 0.0
    for size, ones in zip(sizes, positives):
        if size == 0:
            # an empty group carries zero weight
            continue
        # per-group entropy is rounded to 3 decimals before weighting
        weighted_entropy += size * round(_binary_entropy(ones / size), 3)

    entropy_set = _binary_entropy(sum(positives) / total)
    return round(entropy_set - weighted_entropy / total, 5)

#finding best possible split in a dataset based on information gain
def database_split(ds):
    """Exhaustively search for the split of ``ds`` with the highest information gain.

    Every value of every feature column (all columns except the last, which
    holds the label) is tried as a threshold.

    Args:
        ds: non-empty sequence of rows, label in the last position.

    Returns:
        dict with
          'index'  - column position that was split on (the same position
                     that was passed to ``group_split``, so it can be used
                     directly to index a row),
          'value'  - the exact threshold that produced the groups,
          'groups' - ``(left, right)`` row lists for that threshold.
        When no candidate yields a positive gain, 'groups' is
        ``(all rows, [])`` so the caller can still turn the node into a leaf.
    """
    best_index, best_value, best_gain = 0, 0, 0
    # fallback when nothing improves on zero gain: keep every row on one side
    best_groups = (list(ds), [])
    for index in range(len(ds[0]) - 1):
        for row in ds:
            groups = group_split(index, row[index], ds)
            gain = information_gain(groups)
            # strict comparison: the first candidate reaching a gain wins ties
            if gain > best_gain:
                # store the real column index and the unrounded threshold so
                # that prediction routes rows exactly like this split did
                best_gain, best_index, best_value, best_groups = gain, index, row[index], groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

#converting a node to a leaf node and declaring its value
def make_leaf(group):
    """Return the most frequent label (last element of each row) in ``group``."""
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct = set(labels)
    # pick the distinct label with the largest number of occurrences
    return max(distinct, key=lambda label: labels.count(label))

#recursively creating nodes until pruning conditions are met
def create_nodes(node, max_depth, min_size, depth):
    """Grow the tree below ``node`` in place.

    ``node`` must carry a 'groups' entry holding its ``(left, right)`` rows;
    that entry is removed and replaced by 'left' and 'right' entries, each
    being either a leaf label or a further split node (dict).

    Stopping rules, checked in this order:
      * one side is empty      -> both children become the same leaf,
      * ``depth >= max_depth`` -> both children become leaves,
      * a side has at most ``min_size`` rows -> that side becomes a leaf.
    """
    left, right = node['groups']
    del node['groups']

    # a split that left one side empty separates nothing
    if not (left and right):
        leaf = make_leaf(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # depth budget used up
    if depth >= max_depth:
        node['left'] = make_leaf(left)
        node['right'] = make_leaf(right)
        return

    # left side first, then right; each is either closed off or split further
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = make_leaf(rows)
            continue
        child = database_split(rows)
        node[side] = child
        create_nodes(child, max_depth, min_size, depth + 1)
    
#finding the best first split and then processing each group for further split creating a tree of nodes
def build_tree(ds_train, max_depth, min_size):
    """Build a decision tree from ``ds_train`` and return its root node.

    The root split is chosen with ``database_split``; ``create_nodes`` then
    expands it in place, starting at depth 1, under the given ``max_depth``
    and ``min_size`` limits.
    """
    tree_root = database_split(ds_train)
    create_nodes(tree_root, max_depth, min_size, depth=1)
    return tree_root

#predicting for a row based on the training dataset
def predict(node, row):
    """Route ``row`` down the tree rooted at ``node`` and return the leaf label.

    Args:
        node: split node, a dict with 'index', 'value', 'left' and 'right'.
            A child that is itself a dict is another split node; anything
            else is a leaf label.
        row: indexable record; ``row[node['index']]`` is compared against
            ``node['value']``.

    Returns:
        The label stored at the leaf that the row reaches.
    """
    while True:
        # "<=" goes left, matching the comparison group_split uses when the
        # tree is built; a strict "<" would send rows sitting exactly on the
        # threshold down the opposite branch from the one they trained in.
        branch = 'left' if row[node['index']] <= node['value'] else 'right'
        child = node[branch]
        if not isinstance(child, dict):
            return child
        node = child

#finding scores to evaluate a given predicted array or to score the model
def check_score(list1, list2):
    """Compute binary classification metrics, treating label 1 as positive.

    Args:
        list1: predicted labels (list or array).
        list2: true labels (list or array) of the same length.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1_score' as floats.
        A metric whose denominator is zero (for example precision when
        nothing was predicted positive) is reported as 0.0.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    predicted = np.asarray(list1).ravel()
    actual = np.asarray(list2).ravel()
    if predicted.size != actual.size:
        raise ValueError("check_score: predictions and labels must have the same length")

    predicted_pos = predicted == 1
    actual_pos = actual == 1

    # confusion-matrix cells, counted directly from the boolean masks
    TP = int(np.sum(predicted_pos & actual_pos))    #true positive
    FP = int(np.sum(predicted_pos & ~actual_pos))   #false positive
    FN = int(np.sum(~predicted_pos & actual_pos))   #false negative
    TN = int(np.sum(~predicted_pos & ~actual_pos))  #true negative

    def ratio(numerator, denominator):
        # guard against empty denominators instead of producing nan or raising
        return numerator / denominator if denominator else 0.0

    accuracy = ratio(TP + TN, TP + TN + FP + FN)
    precision = ratio(TP, TP + FP)
    recall = ratio(TP, TP + FN)
    f1_score = ratio(2 * precision * recall, precision + recall)

    #returning a dict so callers do not have to juggle multiple variables
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1_score}

#hold out a quarter of the rows for evaluation (fixed seed keeps the split repeatable)
df_train, df_test = train_test_split(df, test_size=0.25, random_state=100)

#plain 2d numpy arrays are simpler to iterate row by row than dataframes
np_df_train = df_train.to_numpy()
np_df_test = df_test.to_numpy()

#true outcomes of the held-out rows (column 7), kept for scoring the model
y_test = df_test.iloc[:, 7].to_numpy()

#tree limits
max_depth = 2
min_size = 5

#fit the tree on the training rows
tree = build_tree(np_df_train, max_depth, min_size)

#run every held-out row through the tree and collect the integer outcomes
y_pred = [int(predict(tree, row)) for row in np_df_test]

#predictions
print("predicted array is:")
print(y_pred)

#evaluation
score = check_score(y_pred, y_test)
print('accuracy of our model is:', score['accuracy'] * 100, '%')
print('f1_score of our model is:', score['f1_score'] * 100, '%')