<a href="https://colab.research.google.com/github/rposhala/Decision-Tree-Algorithm-and-Bagging-Concepts-using-Python/blob/master/Decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#loading libraries
import numpy as np
import pandas as pd
from pprint import pprint

# Decision Tree algorithm using NumPy

# Function to calculate gini index

In [0]:
def gini(df):
        x_unique,x_count = np.unique(df[:,-1], return_counts=True)
        x_prob = x_count/np.sum(x_count)
        x_gini = 1 - (np.dot(x_prob,x_prob))
        return x_gini


# Function picks the best value to split the real valued attribute  

In [0]:
def pick_the_value(df,n):
        n_best = []
        loss = 1000 # loss would not be more than 1 as gini is atmost 0.5
        unique = np.sort(np.unique(df[:,n]))
        split_values = [((unique[index]+unique[index-1])/2) for index in range(1,len(unique))]
        if len(split_values) > 100 :
            strip = list(range(1,100))
            steps = np.ptp(df[:,n])/100
            mini = np.min(df[:,n])
            split_values = mini+(np.asarray(strip)*steps)

        for i in split_values:
            above = df[df[:,n] > i]
            below = df[df[:,n] <= i] 
            a_gini = gini(above)
            b_gini = gini(below)
            a_len = len(above)
            b_len = len(below)
            current_loss = ((a_len/(a_len+b_len))*a_gini) + ((b_len/(a_len+b_len))*b_gini)
            
            if current_loss < loss:
                loss = current_loss
                n_best = [i,loss,above,below,n]
        return n_best

# Function to pick the best node(attribute) to split

In [0]:
def decision_node(df):
        node_split = []
        par_loss = 1000
        col_no = df.shape[1] - 1
        par_list = np.random.choice(col_no,int(np.floor(np.sqrt(col_no))),replace=False)
        for i in par_list:
            i_best = pick_the_value(df,i)

            if len(i_best)!=0 :
                if i_best[1] < par_loss:
                    par_loss = i_best[1]
                    node_split = i_best
        return node_split


# Decision Tree algorithm using the above functions (without pruning)

In [0]:
#displaying the decision tree with tree pruning option (we can decide till which height the tree can go before stop, it decides the label based on the majority)
    
def decisiontree(df):
        x_unique = np.unique(df[:,-1])
        if len(x_unique) == 1:
            return x_unique[0]
        else :
            # c+=1
            split = decision_node(df)
            col_no = split[4]
            value = split[0]
            df_above = split[2]
            df_below = split[3]
            condition = "{} <= {}".format(col_no,value)
            decision_tree = {condition : []}
            true = decisiontree(df_below)
            false = decisiontree(df_above)

            decision_tree[condition].append(true)
            decision_tree[condition].append(false)

            return decision_tree


# Decision Tree algorithm using the above functions (with pruning)

In [0]:
def decisiontreeprune(df,c = 0, m = 3):
        x_unique = np.unique(df[:,-1])
        if len(x_unique) == 1 or c == m:
            return x_unique[0]
        else :
            c+=1
            split = decision_node(df)
            col_no = split[4]
            value = split[0]
            df_above = split[2]
            df_below = split[3]
            condition = "{} <= {}".format(col_no,value)
            decision_tree = {condition : []}
            true = decisiontreeprune(df_below,c,m)
            false = decisiontreeprune(df_above,c,m)

            decision_tree[condition].append(true)
            decision_tree[condition].append(false)

            return decision_tree

# Function to pass a test sample is being passed through the decision tree and label is assigned

In [0]:
## test sample is being passed through the decision tree and label is assigned
    def classify(test_sample,tree):
        condition = list(tree.keys())[0]
        column_number = condition.split()[0]
        value = condition.split()[2]
        if test_sample[int(column_number)] <= float(value):
            label = tree[condition][0]
        else :
            label = tree[condition][1]
        if type(label) == dict:
            sub_tree = label
            return classify(test_sample,sub_tree)
        else :
            return label

# Function to assign labels for a entire test data

In [0]:

def predict(x_test,tree):
    y_pred = []#np.asarray([])
    for test in x_test:
        y_pred.append(classify(test,tree))
        
    return y_pred

Function defined to calculate Confusion Matrix

In [0]:
def ConfusionMatrix(y_true,y_pred):
    
    
    classes = 11
    modified_list = ((y_true-1)*classes) + (y_pred-1)
    squared_no_of_classes = classes**2
    confusion_matrix = np.histogram(modified_list, bins=np.arange(squared_no_of_classes+1))[0]
    confusion_matrix = confusion_matrix.reshape(classes,classes)
    return confusion_matrix

# Function defined to calculate Accuracy

In [0]:
def Accuracy(y_true,y_pred):

    nppred = np.asarray(y_pred)
    c = y_true - nppred
    misclass = np.count_nonzero(c)
    test_len = len(y_true)
    accuracy = 1 - (misclass/test_len)
    return accuracy
    

Function defined to calculate Recall

In [0]:
def Recall(y_true,y_pred):

    confusion_matrix = ConfusionMatrix(y_true,y_pred)
    macro_recall = np.sum(np.divide(np.diag(confusion_matrix),np.sum(confusion_matrix,axis=1)))/len(confusion_matrix)
    return macro_recall

Function defined to calculate Precision

In [0]:

def Precision(y_true,y_pred):

    confusion_matrix = ConfusionMatrix(y_true,y_pred)
    macro_precision = np.sum(np.divide(np.diag(confusion_matrix),np.sum(confusion_matrix,axis=0)))/len(confusion_matrix)
    return macro_precision

# Loading a dataset using Pandas to test the Decision Tree algorithm

In [0]:
# np.random.seed(4563)
data = pd.read_csv("data.csv", header = None , skiprows = 1)
dataframe = pd.DataFrame()
dataframe = pd.DataFrame(data)
Y = dataframe.pop(dataframe.columns[-1])
ind = np.random.choice(len(dataframe),len(dataframe)*8//10,replace=False)
df = np.asarray(dataframe)
label = np.asarray(Y)
X_train = df[ind]
X_test = np.delete(df,ind,axis = 0)
Y_train = label[ind]
Y_test = np.delete(label,ind,axis = 0)
# train_len = len(x_train)
# test_len = len(x_test)
# col_len = x_train.shape[1]
x_train_norm = (X_train - np.mean(X_train,axis = 0)[np.newaxis,:])/np.std(X_train,axis=0)[np.newaxis,:]
x_test_norm = (X_test - np.mean(X_test,axis = 0)[np.newaxis,:])/np.std(X_test,axis=0)[np.newaxis,:]

In [0]:
Y_train = np.asarray([Y_train])
Y_train = np.swapaxes(Y_train,0,1)
df = np.append(X_train,Y_train,axis=1)


# Displaying entire decision tree

In [17]:
decisiontree = decisiontree(df)
pprint(decisiontree)

{'7 <= -0.011323000000000014': [{'10 <= -0.11505341999999999': [{'43 <= -1.486075': [11.0,
                                                                                     2.0]},
                                                                {'6 <= -0.02476538': [{'35 <= -0.010177700000000001': [{'19 <= 1.3915440000000001': [{'23 <= 1.3459480000000001': [{'47 <= -1.4963000000000002': [{'4 <= 6.17855e-06': [2.0,
                                                                                                                                                                                                                                        10.0]},
                                                                                                                                                                                                                  2.0]},
                                                                                                                           

# Displaying pruned decision tree

In [18]:
pruneddecisiontree = decisiontreeprune(df,c = 0, m = 3)
pprint(pruneddecisiontree)

{'35 <= -0.016697399999999998': [{'1 <= 2.1000100000000002e-05': [{'45 <= -1.493956': [2.0,
                                                                                       1.0]},
                                                                  {'45 <= -1.495666': [2.0,
                                                                                       2.0]}]},
                                 {'23 <= 2.3243067': [{'10 <= 0.06623110000000001': [1.0,
                                                                                     4.0]},
                                                      {'47 <= -1.499432': [4.0,
                                                                           4.0]}]}]}


# Predicting labels for test data using decision tree & displaying calculated Accuracy

In [20]:
y_pred_dt = predict(X_test,decisiontree)
print("Accuracy :",Accuracy(Y_test,y_pred_dt))

Accuracy : 0.981201171875


# Predicting labels for test data using pruned decision tree & displaying calculated Accuracy

In [21]:
y_pred_dt_pruned = predict(X_test,pruneddecisiontree)
print("Accuracy :",Accuracy(Y_test,y_pred_dt_pruned))


Accuracy : 0.1134033203125


*********** The end ***

..

..