In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('train.csv', sep= ',', header = None)
test = pd.read_csv('test.csv', sep= ',', header = None)
train = (train[1:len(train)][:])
test = (test[1:len(test)][:])
train = train.apply(LabelEncoder().fit_transform)
test = test.apply(LabelEncoder().fit_transform)
#print(train)

In [3]:
X_train = train.values[:,0:3]
#print(X_train)
Y_train = train.values[:,4]
X_test = test.values[:,0:3]
Y_test = test.values[:,4]
train = list(np.concatenate((X_train,np.reshape((Y_train),(len(X_train),1))),axis = 1))
test = list(np.concatenate((X_test,np.reshape((Y_test),(len(X_test),1))),axis = 1))


In [4]:
# Using Scikit Learn Library for classifcation
def train_using_gini(X_train, y_train):
    # Creating the classifier object 
    dectree_gini = DecisionTreeClassifier(criterion = "gini") 
    # Performing training 
    dectree_gini.fit(X_train, y_train) 
    return dectree_gini

In [5]:
def train_using_entropy(X_train,Y_train):
    dectree_entropy = DecisionTreeClassifier(criterion = "entropy")
    dectree_entropy.fit(X_train,Y_train)
    return(dectree_entropy)

In [6]:
def prediction(X_test, model):
    y_pred = model.predict(X_test)
    print("Predicted Values:")
    print(y_pred)
    return(y_pred)

In [7]:
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: \n ", confusion_matrix(y_test,y_pred))
    print("Accuracy: \n", accuracy_score(y_test,y_pred))
    print("Classification_Report: \n", classification_report(y_test,y_pred))

In [8]:
dec_tree_gini = train_using_gini(X_train, Y_train)
print("The predicted values through decision tree using gini index as a crtieria \n")
y_pred_gini = prediction(X_test, dec_tree_gini)
cal_accuracy(Y_test,y_pred_gini)



The predicted values through decision tree using gini index as a crtieria 

Predicted Values:
[0 1]
Confusion Matrix: 
  [[1 1]
 [0 0]]
Accuracy: 
 0.5
Classification_Report: 
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         0

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  'recall', 'true', average, warn_for)


In [9]:
dec_tree_entropy = train_using_entropy(X_train, Y_train)

print("The predicted values through decision tree using entropy as a crtieria \n")
y_pred_entropy = prediction(X_test, dec_tree_entropy)
#print(train.apply(LabelEncoder().inverse_transform))

cal_accuracy(Y_test,y_pred_entropy)


The predicted values through decision tree using entropy as a crtieria 

Predicted Values:
[0 1]
Confusion Matrix: 
  [[1 1]
 [0 0]]
Accuracy: 
 0.5
Classification_Report: 
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         0

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  'recall', 'true', average, warn_for)


In [12]:
##########################################################################
##########################################################################
## Writing Decision tree from scratch without using any python library  ##
##########################################################################
##########################################################################


### Creating a split on the basis of an attribute ###
def split_data(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index]==value:
            left.append(row)
        else:
            right.append(row)
    return( left, right)


### Computing
def gini_index(splits,values):
    total_count = sum([len(split) for split in splits])
    gini = 0.0
    for split in splits:
        size = len(split)
        if size == 0:
            continue
        score = 0
        for value in values:
            score += pow(([row[-1] for row in split].count(value)/size),2)
        gini += (1-score)*(size/total_count)
    return(gini)

def entropy(splits,values):
    total_count = sum([len(split) for split in splits])
    entropy = 0
    for split in splits:
        size = len(split)
        if size == 0:
            continue
        score = 0
        for value in values:
            entropy += ([row[-1] for row in split].count(value)/size)*np.log2(([row[-1] for row in split].count(value)/size))
        gain += -1*(entropy)*(size/total_count)
    return(gain)

def get_best_split_entropy(dataset): 
    y_label = list(set(row[-1] for row in dataset))
    best_index,best_value,best_score,best_splits = 99,99,99,None
    for index in range(len(dataset[0])-1):
        for i in range(2):
            splits = split_data(index,i,dataset)
            entropy = entropy(splits,y_label)
            if entropy > best_score:
                best_index = index
                best_value = i
                best_score = gini
                best_splits = splits
    return{'index':best_index,'value':best_value,'best_score':best_score,'splits':best_splits}


def get_best_split_gini(dataset): 
    y_label = list(set(row[-1] for row in dataset))
    best_index,best_value,best_score,best_splits = 99,99,99,None
    for index in range(len(dataset[0])-1):
        for i in range(2):
            splits = split_data(index,i,dataset)
            gini = gini_index(splits,y_label)
            if gini < best_score:
                best_index = index
                best_value = i
                best_score = gini
                best_splits = splits
    print("The best gini index for this node is", best_score)
    return{'index':best_index,'value':best_value,'best_score':best_score,'splits':best_splits}

    
def output_majority_class(split):
    y_label = [row[-1] for row in split]
    return(max(set(y_label), key = y_label.count))

def recursive_splitting_gini(node, max_depth,depth): 
    left, right = node['splits']
    del(node['splits'])
    if not left or not right:
        node['left'] = node['right'] = output_majority_class(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = output_majority_class(left),output_majority_class(right)
        return
    else:
        node['left'] = get_best_split_gini(left)
        recursive_splitting_gini(node['left'], max_depth,depth+1)
        node['right'] = get_best_split_gini(right)
        recursive_splitting_gini(node['right'], max_depth, depth+1)
        
        
def recursive_splitting_entropy(node, max_depth,depth): 
    left, right = node['splits']
    del(node['splits'])
    if not left or not right:
        node['left'] = node['right'] = output_majority_class(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = output_majority_class(left),output_majority_class(right)
        return
    else:
        node['left'] = get_best_split_entropy(left)
        recursive_splitting_entropy(node['left'], max_depth,depth+1)
        node['right'] = get_best_split_entropy(right)
        recursive_splitting_entropy(node['right'], max_depth, depth+1)

def build_tree_gini(train, max_depth):
    root = get_best_split_gini(train)
    recursive_splitting_gini(root, max_depth,1)
    return(root)

def build_tree_entropy(train, max_depth):
    root = get_best_split_entropy(train)
    recursive_splitting_entropy(root, max_depth,1)
    return(root)
    
def print_tree(node, depth=0):
	if isinstance(node, dict):
		print('%s|X%d = %d' % ((depth*'\t', (node['index']+1), int(node['value']))))
		print_tree(node['left'], depth+1)
		print_tree(node['right'], depth+1)
	else:
		print('%s:[%s]' % ((depth*'\t', node)))
    

# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']

def prediction(test,tree):
    y_pred = []
    for row in test:
        y_pred.append(predict(tree,row))
    return(y_pred)


In [13]:
############
tree_gini = build_tree_gini(train,4)
print_tree(tree_gini)
type(test)
y_pred = prediction(test,tree_gini)
cal_accuracy(y_pred,Y_test)

The best gini index for this node is 0.41666666666666663
The best gini index for this node is 0.0
The best gini index for this node is 0.3666666666666666
The best gini index for this node is 0.0
The best gini index for this node is 0.0
The best gini index for this node is 0.0
The best gini index for this node is 0.26666666666666666
The best gini index for this node is 0.0
The best gini index for this node is 0.3333333333333333
|X2 = 1
	|X1 = 0
		:[1]
		:[1]
	|X1 = 0
		|X3 = 0
			|X1 = 0
				:[0]
				:[0]
			|X1 = 0
				:[1]
				:[1]
		|X1 = 1
			|X1 = 0
				:[0]
				:[0]
			|X2 = 0
				:[0]
				:[0]
Confusion Matrix: 
  [[1 0]
 [1 0]]
Accuracy: 
 0.5
Classification_Report: 
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33 