In [1]:
import pandas as pd
import numpy as np

In [2]:
#Check the categorical data
def check_categorical(data):
    """Label-encode the text columns of a DataFrame and return a float matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Columns with ``object`` or string dtype are treated as
        categorical; every other column must be convertible to float.

    Returns
    -------
    tuple
        ``(matrix, categorical_index)``: ``matrix`` is a 2-D float array with
        the same shape as ``data``; ``categorical_index`` is the list of
        column positions that were encoded. Each category is replaced by its
        position among the sorted unique values of its column (0, 1, 2, ...).
    """
    # Detect text columns by dtype family instead of comparing against the
    # literal 'object': pandas may infer a dedicated string dtype for text,
    # which the literal comparison does not match.
    categorical_index = [
        position
        for position, dtype in enumerate(data.dtypes)
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]

    # Private, writable, object-typed copy: codes can be written in place
    # without touching the caller's frame.
    values = data.to_numpy(dtype=object, copy=True)

    for column in categorical_index:
        # np.unique sorts the categories; the inverse index is the code of
        # every row in one vectorised step.
        _, codes = np.unique(values[:, column], return_inverse=True)
        values[:, column] = codes

    return values.astype(float), categorical_index

In [3]:
#Calculation of GINI Index
def gini_helper(node):
    """Return p0**2 + p1**2 for the class labels stored in the last column.

    ``node`` is indexed as ``node[:, -1]``, so it must be a non-empty 2-D
    array whose last column holds the labels. Only labels equal to 0 and 1
    contribute to the score.
    """
    labels = node[:, -1]
    n_rows = float(len(node))
    # Share of rows carrying each of the two class labels.
    p_zero = np.count_nonzero(labels == 0) / n_rows
    p_one = np.count_nonzero(labels == 1) / n_rows
    return 0.0 + p_zero * p_zero + p_one * p_one
 
def calc_gini_index(left_n,right_n):
    """Size-weighted Gini impurity of a two-way split.

    Each non-empty side contributes ``(1 - gini_helper(side))`` weighted by
    its share of the combined row count. Empty sides are skipped, so the
    result is 0 when both sides are empty.
    """
    n_left, n_right = len(left_n), len(right_n)
    total = float(n_left + n_right)
    gini_index = 0
    for part, part_size in ((left_n, n_left), (right_n, n_right)):
        if part_size:
            purity = gini_helper(part)
            gini_index += (1.0 - purity) * (float(part_size) / total)
    return gini_index

In [4]:
# Split a dataset based on an attribute and an attribute value
def split(train_dataset,split_value, index, categorical_index):
    """Partition rows into (left, right) by testing one attribute.

    Parameters
    ----------
    train_dataset : iterable of rows
        Each row must support ``row[index]``.
    split_value : scalar
        Value the attribute is compared against.
    index : int
        Position of the attribute inside each row.
    categorical_index : container of int
        Attribute positions that are categorical.

    Returns
    -------
    tuple of list
        ``(left, right)``. For a categorical attribute a row goes left when
        its value equals ``split_value``; otherwise it goes left when its
        value is strictly less than ``split_value``. All other rows go right.
    """
    # The attribute kind does not change from row to row, so decide it once.
    is_categorical = index in categorical_index
    left, right = [], []
    for r in train_dataset:
        if is_categorical:
            goes_left = r[index] == split_value
        else:
            goes_left = r[index] < split_value
        if goes_left:
            left.append(r)
        else:
            right.append(r)
    return left, right

In [5]:
# Select the best split point for a dataset
def split_node(train_dataset, categorical_index):
    """Find the attribute/value pair with the lowest weighted Gini index.

    Parameters
    ----------
    train_dataset : numpy.ndarray
        2-D array; the last column holds the class label and every other
        column is a candidate attribute.
    categorical_index : container of int
        Attribute positions that must be split by equality instead of ``<``.

    Returns
    -------
    dict
        ``split_attr`` / ``split_value`` identify the chosen test, ``left`` /
        ``right`` hold the two row groups as arrays, and ``categorical`` tells
        whether the chosen attribute is categorical (equality test).
        If no candidate is evaluated (no rows or no attribute columns) the
        placeholders ``999`` / ``None`` are returned unchanged.
    """
    min_err = float('inf')
    split_attr, left, right, split_value = 999, None, None, 999
    for index in range(train_dataset.shape[1] - 1):
        # A repeated value produces the same partition and the same score,
        # which can never beat the strict `<` below. Trying each distinct
        # value once (in first-occurrence order) gives the same winner with
        # far fewer scans of the data.
        for candidate in dict.fromkeys(train_dataset[:, index]):
            l, r = split(train_dataset, candidate, index, categorical_index)
            l, r = np.asarray(l), np.asarray(r)
            err = calc_gini_index(l, r)
            if err < min_err:
                min_err = err
                left, right = l, r
                split_attr, split_value = index, candidate

    return {
        'split_attr': split_attr,
        'left': left,
        'right': right,
        'split_value': split_value,
        # Kept on the node so the test used for training can be reproduced
        # when the node is later used to route a row.
        'categorical': split_attr in categorical_index,
    }

In [6]:
# Terminal node: pick the majority class label (ties resolve to 0)
def to_terminal(left_n,right_n):
    """Return the majority class (0 or 1) over both row groups.

    Labels are read from the last column of each non-empty group. The result
    is 1 only when labels equal to 1 strictly outnumber labels equal to 0;
    ties and empty input yield 0.
    """
    ones = zeroes = 0
    for part in (left_n, right_n):
        # Empty groups may be plain lists, so test the length before slicing.
        if len(part):
            labels = list(part[:, -1])
            zeroes += labels.count(0)
            ones += labels.count(1)
    return 1 if ones > zeroes else 0

#Build the decision tree from the root
def build_tree(node,categorical_index):
    """Recursively replace a node's row groups with leaves or sub-trees.

    ``node`` is a dict as produced by ``split_node``; it is modified in place
    and also returned. Its ``left`` / ``right`` row arrays are removed and
    replaced by either a class label from ``to_terminal`` or a nested node
    dict.
    """
    left_rows = node.pop('left')
    right_rows = node.pop('right')

    # A split that leaves one side empty cannot separate anything further:
    # both branches become the same majority-vote leaf.
    if not (len(left_rows) and len(right_rows)):
        leaf = to_terminal(left_rows, right_rows)
        node['left'] = leaf
        node['right'] = leaf
        return node

    # Left branch: a single distinct label means the group is pure.
    left_is_pure = len(set(left_rows[:, -1])) == 1
    node['left'] = (
        to_terminal(left_rows, [])
        if left_is_pure
        else build_tree(split_node(left_rows, categorical_index), categorical_index)
    )

    # Right branch, same rule.
    right_is_pure = len(set(right_rows[:, -1])) == 1
    node['right'] = (
        to_terminal([], right_rows)
        if right_is_pure
        else build_tree(split_node(right_rows, categorical_index), categorical_index)
    )
    return node


In [7]:
#Decision Tree Classification
def dt_classify(node, row):
    """Walk the tree from ``node`` and return the class label for ``row``.

    Parameters
    ----------
    node : dict
        Tree node with ``split_attr``, ``split_value``, ``left`` and ``right``.
        A child that is not a dict is treated as a leaf label. If the node
        has a truthy ``categorical`` entry the row goes left when its
        attribute equals ``split_value``; without that entry the numeric test
        ``row[attr] < split_value`` is used.
    row : sequence
        Attribute values indexable by ``split_attr``.

    Returns
    -------
    The leaf value reached by following the tests down the tree.
    """
    while True:
        attr_value = row[node['split_attr']]
        # A categorical split was formed with `==` during training; routing
        # it with `<` would send rows down the wrong branch.
        if node.get('categorical', False):
            goes_left = attr_value == node['split_value']
        else:
            goes_left = attr_value < node['split_value']

        child = node['left'] if goes_left else node['right']
        if not isinstance(child, dict):
            return child
        node = child
    
#Calculation of Confusion Matrix
def confusion_matrix(original, predicted):
    """Compute accuracy, precision, recall and F-measure for binary labels.

    Parameters
    ----------
    original : sequence
        True labels; 1 is the positive class.
    predicted : sequence
        Predicted labels, indexed in parallel with ``original``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f_measure)``. A metric whose
        denominator is zero is reported as 0 instead of raising.
    """
    accuracy = precision = recall = f_measure = 0
    TP = FN = FP = TN = 0
    for i, actual in enumerate(original):
        guess = predicted[i]
        if actual == 1 and guess == 1:
            TP += 1
        elif actual == 1 and guess == 0:
            FN += 1
        elif actual == 0 and guess == 1:
            FP += 1
        else:
            # Every remaining label combination is counted as a true negative.
            TN += 1

    total = TP + FN + FP + TN
    if total != 0:
        accuracy = float(TP + TN) / total
    if (TP + FP) != 0:
        precision = float(TP) / (TP + FP)
    if (TP + FN) != 0:
        recall = float(TP) / (TP + FN)
    # Guarded like the other metrics: with no positives in either the truth
    # or the predictions the denominator is zero.
    f_denominator = (2 * TP) + FN + FP
    if f_denominator != 0:
        f_measure = float(2 * TP) / f_denominator

    return accuracy, precision, recall, f_measure
    

In [8]:
#Main Function
def __main__(file='project3_dataset1.txt', n_folds=10):
    """Run k-fold cross-validation of the decision tree and print the scores.

    Parameters
    ----------
    file : str
        Path of a tab-separated, header-less data file whose last column is
        the class label.
    n_folds : int
        Number of cross-validation folds (at least 2).

    Prints accuracy, precision, recall and F-1 measure as percentages
    averaged over the folds.
    """
    if n_folds < 2:
        raise ValueError('n_folds must be at least 2')

    accuracy = precision = recall = f_measure = 0

    # Load the file into a DataFrame and encode text columns as numbers
    data = pd.read_csv(file, sep="\t", header=None)
    data, categorical_index = check_categorical(data)

    # Consecutive, (almost) equally sized folds
    folds = np.array_split(data, n_folds)

    for i, fold in enumerate(folds):
        # The current fold is the test set; all other folds form the train set
        test = np.asarray(fold)
        train = np.vstack([other for j, other in enumerate(folds) if j != i])

        root = build_tree(split_node(train, categorical_index), categorical_index)

        # Classification and prediction of the test points
        predicted_labels = [dt_classify(root, test_point) for test_point in test]

        # Accuracy, precision, recall and F-measure for this fold
        acc, prec, rec, f_meas = confusion_matrix(test[:, -1], np.asarray(predicted_labels))
        accuracy += acc
        precision += prec
        recall += rec
        f_measure += f_meas

    # The sums hold one fraction per fold; convert to a mean percentage.
    scale = 100.0 / len(folds)

    # Outputs
    print('Decision Tree Results for the file {} --------------'.format(file))
    print('Accuracy : {}'.format(str(accuracy * scale)))
    print('Precision: {}'.format(str(precision * scale)))
    print('Recall : {}'.format(str(recall * scale)))
    print('F-1 Measure: {}'.format(str(f_measure * scale)))
        
    
# Run the cross-validation experiment defined above when this cell executes.
__main__()

Decision Tree Results for the file project3_dataset1.txt --------------
Accuracy : 92.43421052631578
Precision: 91.47086922452317
Recall : 88.53501551356514
F-1 Measure: 89.79424987236926
