# Train a decision tree using only the categorical features. Report precision, recall, F1 score, and accuracy.

In [2]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split 
# Load the training CSV and keep only the columns used below. 'left' is kept
# last because stats() treats every column except the last one as a feature.
dataset = pd.read_csv("../input_data/train.csv")[
    ['promotion_last_5years', 'Work_accident', 'sales', 'salary', 'left']
]

In [3]:
def entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    Args:
        column: Array-like of labels; anything ``np.unique`` accepts
            (list, NumPy array, pandas Series).

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of
        each distinct value. An empty ``column`` or a column with a single
        distinct value yields ``0.0``.
    """
    _, counts = np.unique(column, return_counts=True)
    # Every count returned by np.unique is >= 1, so log2 never sees a zero.
    probabilities = counts / counts.sum()
    # Subtracting from 0.0 (rather than negating) keeps the result a
    # positive zero for pure or empty columns.
    return 0.0 - np.sum(probabilities * np.log2(probabilities))

In [4]:
def InfoGain(data,split,target="left"):
    """Return the information gain of splitting ``data`` on column ``split``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of ``target`` within each distinct value of ``split``.

    Args:
        data: pandas DataFrame containing both ``split`` and ``target``.
        split: Name of the column to split on.
        target: Name of the label column.

    Returns:
        The information gain as a float.
    """
    total_entropy = entropy(data[target])
    vals, counts = np.unique(data[split], return_counts=True)
    n_rows = counts.sum()
    weighted_entropy = 0
    for val, count in zip(vals, counts):
        # Select the branch with a boolean mask on the target column only, so
        # a missing value in an unrelated column cannot drop rows from the
        # branch and make its entropy disagree with its weight.
        branch_labels = data.loc[data[split] == val, target]
        weighted_entropy += (count / n_rows) * entropy(branch_labels)
    return total_entropy - weighted_entropy

In [5]:
def train_validate_test_split(dataset):
    """Split ``dataset`` by row position into 60% / 20% / 20% partitions.

    Rows are taken in their existing order (no shuffling). Each partition
    gets a fresh 0-based index.

    Returns:
        A ``(training, validation, testing)`` tuple of DataFrames.
    """
    n_rows = len(dataset)
    # Cut points: [0, 60%) -> train, [60%, 80%) -> validation, rest -> test.
    cuts = [0, int(n_rows * 0.6), int(n_rows * 0.8), n_rows]
    return tuple(
        dataset.iloc[start:stop].reset_index(drop=True)
        for start, stop in zip(cuts, cuts[1:])
    )

In [6]:
def _majority_label(frame, label):
    """Return the most frequent value in ``frame[label]``."""
    values, counts = np.unique(frame[label], return_counts=True)
    return values[np.argmax(counts)]


def createtree(subdata,data,attributes,label="left",parent = None):
    """Recursively build a decision tree by maximising information gain.

    Args:
        subdata: DataFrame of the rows that reach the current node.
        data: The full training DataFrame; its majority label is used when
            ``subdata`` is empty.
        attributes: Sequence of candidate column names to split on.
        label: Name of the target column.
        parent: Majority label of the parent node, returned when no
            attributes remain.

    Returns:
        Either a leaf label, or a nested dict of the form
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    # The empty check must come first: indexing [0] into the unique labels
    # of an empty frame would raise IndexError.
    if len(subdata) == 0:
        return _majority_label(data, label)

    classes = np.unique(subdata[label])
    if len(classes) <= 1:
        return classes[0]

    if len(attributes) == 0:
        # At the top level there is no parent yet; fall back to this node's
        # own majority instead of returning None.
        return parent if parent is not None else _majority_label(subdata, label)

    parent = _majority_label(subdata, label)
    gains = [InfoGain(subdata, attribute, label) for attribute in attributes]
    selected_attribute = attributes[np.argmax(gains)]
    remaining = [a for a in attributes if a != selected_attribute]

    branches = {}
    for value in np.unique(subdata[selected_attribute]):
        # Boolean mask keeps column dtypes and never drops rows because of
        # missing values in other columns.
        sub_data = subdata[subdata[selected_attribute] == value]
        # Pass the caller-supplied `data` down, not a module-level global.
        branches[value] = createtree(sub_data, data, remaining, label, parent)
    return {selected_attribute: branches}

In [7]:
def predict(sample,tree,default = 1):
    """Classify ``sample`` by walking ``tree`` from the root to a leaf.

    Args:
        sample: Mapping of attribute name to value.
        tree: Nested dict of the form ``{attribute: {value: subtree_or_label}}``,
            or a bare label when the tree is a single leaf.
        default: Label returned when the walk cannot continue, i.e. the
            sample has none of a node's attributes or carries a value the
            node has no branch for.

    Returns:
        The leaf label reached, or ``default``.
    """
    node = tree
    while isinstance(node, dict):
        # Use the first attribute of the sample that this node splits on.
        for attribute in sample:
            if attribute in node:
                break
        else:
            return default
        try:
            node = node[attribute][sample[attribute]]
        except (KeyError, TypeError):
            # Value not seen for this attribute (or not usable as a key).
            return default
    return node

In [8]:
def safe_div(x,y):
    """Return ``x / y``, or ``0`` when ``y`` equals zero."""
    return x / y if y != 0 else 0

In [9]:
def stats(data,tree):
    """Print classification metrics of ``tree`` evaluated on ``data``.

    Every column of ``data`` except the last is used as a feature; the true
    labels are read from the ``'left'`` column. A prediction equal to 0 is
    counted as negative and anything else as positive.

    Args:
        data: pandas DataFrame with the feature columns followed by the
            label column.
        tree: Decision tree as accepted by ``predict``.

    Prints classification error, accuracy, recall, precision and F1 score.
    """
    queries = data.iloc[:, :-1].to_dict(orient="records")
    # Collect predictions in a plain list; growing a DataFrame one row at a
    # time with .loc is needlessly slow.
    predictions = [predict(query, tree, 1.0) for query in queries]
    actual = data['left'].tolist()

    tp = tn = fp = fn = 0
    for predicted_label, true_label in zip(predictions, actual):
        if predicted_label == 0.0:
            if true_label == 0:
                tn += 1
            else:
                fn += 1
        else:
            if true_label == 0:
                fp += 1
            else:
                tp += 1

    total = tp + fp + tn + fn
    classification_error = safe_div(fp + fn, total)
    accuracy = safe_div(tp + tn, total)
    recall = safe_div(tp, tp + fn)
    precision = safe_div(tp, tp + fp)
    # Harmonic mean of precision and recall; safe_div makes it 0 when both
    # are 0.
    f1_score = safe_div(2, safe_div(1, precision) + safe_div(1, recall))

    print("Classification error:",classification_error)
    print("Accuracy:",accuracy)
    print("Recall:",recall)
    print("Precision:",precision)
    print("F1 Score:",f1_score)

In [10]:
if __name__ == "__main__":
    # Split the data, fit the tree on the training partition only, then
    # report the metrics on each partition in turn.
    training_data, validation_data, testing_data = train_validate_test_split(dataset)
    print(len(training_data), len(validation_data), len(testing_data))
    feature_columns = training_data.columns[:-1]
    tree = createtree(training_data, training_data, feature_columns)
    reports = (
        ("Performance on training data", training_data),
        ("\nPerformance on validation data", validation_data),
        ("\nPerformance on testing data", testing_data),
    )
    for heading, partition in reports:
        print(heading)
        stats(partition, tree)

6742 2248 2248
Performance on training data
Classification error: 0.23524176802135865
Accuracy: 0.7647582319786413
Recall: 0.000630119722747322
Precision: 1.0
F1 Score: 0.0012594458438287153

Performance on validation data
Classification error: 0.24154804270462635
Accuracy: 0.7584519572953736
Recall: 0.001841620626151013
Precision: 0.5
F1 Score: 0.003669724770642202

Performance on testing data
Classification error: 0.24243772241992884
Accuracy: 0.7575622775800712
Recall: 0.003669724770642202
Precision: 0.5
F1 Score: 0.007285974499089253
