In [80]:
import numpy as np
from numpy import log2 as log
import pandas as pd
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
eps = np.finfo(float).eps

In [81]:
df = pd.read_csv('train.csv')

In [82]:
Y = df.left
X = df.drop(['left','number_project','last_evaluation','satisfaction_level','average_montly_hours','time_spend_company'], axis=1)

In [83]:
Z = pd.concat([X,pd.get_dummies(X['sales'],prefix='sales')],axis=1)
Z = pd.concat([Z,pd.get_dummies(Z['salary'],prefix='salary')],axis=1)
Z = Z.drop(['sales','salary'],axis=1)

In [84]:
X_train, X_test, Y_train, Y_test = train_test_split(Z, Y,test_size=0.2)
df1 = pd.concat([X_train, Y_train],axis=1)

In [85]:
def class_entropy(df):
    class_label = df.keys()[-1]
    class_entropy = 0
    values = df[class_label].unique()
    for val in values:
        q = float(df[class_label].value_counts()[val])/len(df[class_label])
        class_entropy += -q*log(q)
    return class_entropy

In [86]:
def feature_entropy(df, feature):
    class_label = df.keys()[-1]
    target_variables = df[class_label].unique()
    variables = df[feature].unique()
    entropy = 0
    for var in variables:
        ent = 0
        for t in target_variables:
            n = len(df[feature][df[feature]==var][df[class_label]==t])
            d = len(df[feature][df[feature]==var])
            q = n/(d+eps)
            ent += -q*log(q+eps)
        q2 = float(d)/len(df)
        entropy += -q2*ent
    return abs(entropy)

In [87]:
def feature_to_select(df):
    entropy_attr = []
    gain = []
    for key in df.keys()[:-1]:
        gain.append(class_entropy(df)-feature_entropy(df,key))
    return df.keys()[:-1][np.argmax(gain)]

In [88]:
def subtable(df,node,value):
    return df[df[node]==value].reset_index(drop=True)

In [89]:
class Node:
    def __init__(self,feature,positive=0,negative=0):
        self.feature=feature
        self.positive=positive
        self.negative=negative
        self.left=None
        self.right=None

In [90]:
def build_Tree(df):
    if len(df.columns)==1:
        return None
    
    split_node = feature_to_select(df)
    root = Node(split_node)
    
    subtable_left = subtable(df,split_node,0)
    subtable_right = subtable(df,split_node,1)
    
    subtable_left = subtable_left.drop(split_node,axis=1)
    subtable_right = subtable_right.drop(split_node,axis=1)
    
    clValue_left,counts_left = np.unique(subtable_left['left'],return_counts=True)
    clValue_right,counts_right = np.unique(subtable_right['left'],return_counts=True)
    
    if len(counts_left)<=1:
        if len(counts_left)==1:
            if clValue_left[0]==0:
                root.negative=counts_left[0]
            else:
                root.positive=counts_left[0]
    else:
        root.left=build_Tree(subtable_left)
        
    if len(counts_right)<=1:
        if len(counts_right)==1:
            if clValue_right[0]==0:
                root.negative=counts_right[0]
            else:
                root.positive=counts_right[0]
    else:
        root.right=build_Tree(subtable_right)
        
    return root

In [92]:
root=build_Tree(df1)

In [93]:
def rec_predict(df,root,prediction):
    if root==None:
        return None
    
    if root.right==None or root.left==None:
        prediction.append(1 if root.positive>root.negative else 0)
        return
    
    if df[root.feature]==0:
        rec_predict(df,root.left,prediction)
    else:
        rec_predict(df,root.right,prediction)
        
def predict(df,root,prediction):
    for col,row in df.iterrows():
        rec_predict(row,root,prediction)

In [96]:
prediction = []
predict(X_test,root,prediction)

print confusion_matrix(Y_test,prediction)
print classification_report(Y_test,prediction)
print accuracy_score(Y_test,prediction)

[[1689   14]
 [ 545    0]]
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      1703
           1       0.00      0.00      0.00       545

   micro avg       0.75      0.75      0.75      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.57      0.75      0.65      2248

0.7513345195729537


In [95]:
model = tree.DecisionTreeClassifier()

model.fit(X_train, Y_train)
Y_predict = model.predict(X_test)
print confusion_matrix(Y_test,Y_predict)
print classification_report(Y_test,Y_predict)
print accuracy_score(Y_test,Y_predict)

[[1701    2]
 [ 545    0]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1703
           1       0.00      0.00      0.00       545

   micro avg       0.76      0.76      0.76      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.57      0.76      0.65      2248

0.7566725978647687
