In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import os
import math
import re
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

import warnings
warnings.filterwarnings('ignore')

In [88]:
BASE_DIR = '../'

train_path = os.path.join(BASE_DIR, 'data', 'bank_dataset', 'bank_train.csv')
test_path  = os.path.join(BASE_DIR, 'data', 'bank_dataset', 'bank_test.csv')
val_path   = os.path.join(BASE_DIR, 'data', 'bank_dataset', 'bank_val.csv')

In [89]:
def load_data(one_hot_encoding, numeric_cols, filename, values_dict = {}):
    df = pd.read_csv(filename, delimiter = ';')
    Y = df['y'].copy()
    Y  = Y.to_numpy()
    for i in range(Y.shape[0]):
        if Y[i] == 'yes':
            Y[i] = 1
        else:
            Y[i] = 0 #Assigning 0 to nan values
    
    Y = Y.astype('int64')
    df = df.drop(['y'],axis=1)
    
    if one_hot_encoding == False:
        return df, Y
    
    if one_hot_encoding == True and values_dict == {}:
        for col in df.columns:
            if col not in numeric_cols:
                values = list(set(list(df[col])))
                values_dict[col] = values
                
                for i in range(df.shape[0]):
                    df[col][i] = values_dict[col].index(df[col][i])
        return df, Y, values_dict
    
    elif one_hot_encoding == True:
        for col in df.columns:
            if col not in numeric_cols:
                for i in range(df.shape[0]):
                    if df[col][i] in values_dict[col]:
                        df[col][i] = values_dict[col].index(df[col][i])
                    else:
                        df[col][i] = -1
        return df, Y

In [90]:
def entropy_num(X,Y):
    median = np.median(X)
    boolean_flag = X > median
    # X_left  = X[boolean_flag == False]
    # X_right = X[boolean_flag == True]
    Y_left  = Y[boolean_flag == False]
    Y_right = Y[boolean_flag == True]
    
    p1,p2 = 0,0
    
    if Y_left.shape[0] > 0:
        s1 = np.sum(Y_left)
        s1 = max(s1, Y_left.shape[0] - s1)
        p1 = float(s1)/float(Y_left.shape[0])
        p1 = - p1 * np.log(p1)
        
        if Y_left.shape[0] > s1:
            p_t = float(Y_left.shape[0]-s1)/float(Y_left.shape[0])
            p_t = -p_t * np.log(p_t)        
            p1 += p_t
        
        p1 = p1*(float(Y_left.shape[0]))/float(Y.shape[0])
    
    if Y_right.shape[0] > 0:
        s2 = np.sum(Y_right)
        s2 = max(s2, Y_right.shape[0] - s2)
        p2 = float(s2)/float(Y_right.shape[0])
        p2 = - p2 * np.log(p2)
        
        if Y_right.shape[0] > s2:
            p_t = float(Y_right.shape[0]-s2)/float(Y_right.shape[0])
            p_t = -p_t * np.log(p_t)
            p2 += p_t
        
        p2 = p2*(float(Y_right.shape[0]))/float(Y_right.shape[0])
    
    # print(p1,p2,X,Y)
    
    return p1+p2

def entropy_categorical(X,Y):
    val = list(set(list(X)))
    val_count = dict.fromkeys(val,[0,0])
    
    for i in range(X.shape[0]):
        val_count[X[i]][1] += 1
        if Y[i] == 1:
            val_count[X[i]][0] += 1
    entr = 0

    for category,count in val_count.items():
        p = 0
        val_count[category][0] = max(count[0], count[1] - count[0])
        if count[1] > 0:
            p = float(val_count[category][0])/float(count[1])
            p = -p * np.log(p)

            p_t = float(count[1]-val_count[category][0])/float(count[1])
            p_t = -p_t * np.log(p_t)

            p += p_t
            
            p = p * (float(count[1]))/float(Y.shape[0])        
        entr += p
    
    return entr
    

In [91]:
def information_gain(attribute, one_hot_encoding, numeric_cols, parent_entr, indices, X, Y):
    X_new = np.array((X.iloc[indices])[attribute])
    Y_new = Y[indices]
    entr  = 0
    info_parent = parent_entr
    
    if attribute in numeric_cols:
        entr = entropy_num(X_new, Y_new)
        # print("Info Gain:",attribute,entr)
    else:
        if one_hot_encoding == False: # Multi split
            entr = entropy_categorical(X_new, Y_new)
        else:
            entr = entropy_num(X_new, Y_new)
    
    
    
    return info_parent - entr

In [92]:
def best_attribute(one_hot_encoding, rem_attr, numeric_cols, parent, indices, X, Y):
    best_attr = ''
    info_gain = -float('inf')
    
    parent_entr = 0
    # if parent == None:
    #     parent_entr = 0
    # elif parent.attr in numeric_cols:
    #     parent_entr = entropy_num
    
    for attr in X.columns:
        if attr in numeric_cols or attr in rem_attr:            
            temp = information_gain(attr, one_hot_encoding, numeric_cols, parent_entr, indices, X, Y)
            # print("Best_Attr Selection:",attr,temp)
            if temp > info_gain:
                info_gain = temp
                best_attr = attr
    
    return best_attr, info_gain

In [93]:
class dc_node:

    # indices coming at this node
    def __init__(self,parent,indices,depth,decision,median=0,value=None,attribute=None):
        self.parent = parent
        self.indices = indices

        self.child = []
        self.depth = depth
        self.attr  = attribute

        self.decision = decision
        self.median   = median
        self.value    = value

In [94]:
def construct_decision_tree(one_hot_encoding, rem_attr, numeric_cols, parent, indices, X, Y, MAX_DEPTH):
    Y_new = np.array(Y[indices])
    if np.sum(Y_new) > (Y_new.shape[0] - np.sum(Y_new)):
        decision = 1
    else:
        decision = 0
    
    if indices.shape[0] == 1 or (parent != None and parent.depth >= MAX_DEPTH):
        if parent == None:
            return dc_node(parent, indices, 1, decision)
        else:
            return dc_node(parent, indices, parent.depth+1, decision)
    else:
        attr,gain = best_attribute(one_hot_encoding, rem_attr, numeric_cols, parent, indices, X, Y)
        
        depth = 0
        if parent == None:
            depth = 0
        else:
            depth = parent.depth + 1
        node = dc_node(parent,indices,depth,decision,attribute=attr)
        
        print("Attr:",attr,",  Gain: ",gain,",  Depth:",depth)
        # print("Rem:",rem_attr)
        # print("Numeric:",numeric_cols)
        # print("Indices:",indices)
        
        # if gain > 0:
        X_new = np.array((X.iloc[indices])[attr])
        
        if attr in numeric_cols or (one_hot_encoding == True):
            median = np.median(X_new)
            node.median = median
            
            boolean_flag = X_new > median
            ind_left  = indices[boolean_flag == False]
            ind_right = indices[boolean_flag == True]
            
            if one_hot_encoding == True and attr not in numeric_cols:
                boolean_flag = (X_new % 2 == 0)
                ind_left  = indices[boolean_flag == False]
                ind_right = indices[boolean_flag == True]
                rem_attr.remove(attr)
                
            
            # print("Left: ",ind_left,ind_left.shape[0])
            # print("Right: ",ind_right,ind_right.shape[0])
            # print("Indices: ", indices, indices.shape[0])

            if ind_left.shape[0] > 0:
                left  = construct_decision_tree(one_hot_encoding, rem_attr.copy(), numeric_cols, node, ind_left, X, Y, MAX_DEPTH)
                left.value = 'left'
                node.child.append(left)
            
            if ind_right.shape[0] > 0:
                right = construct_decision_tree(one_hot_encoding, rem_attr.copy(), numeric_cols, node, ind_right, X, Y, MAX_DEPTH)
                right.value = 'right'
                node.child.append(right)
                                    
        elif one_hot_encoding == False and attr in rem_attr:
            rem_attr.remove(attr)
            val = list(set(list(X_new)))
            for i in val:
                ind = indices[X_new == i]
                if ind.shape[0] > 0:
                    child = construct_decision_tree(one_hot_encoding, rem_attr.copy(), numeric_cols, node, ind, X, Y, MAX_DEPTH)
                    child.value = i
                    node.child.append(child)
        return node

In [95]:
def decision_tree(one_hot_encoding, categorical_cols, numeric_cols, X, Y, MAX_DEPTH = 20):
    indices = np.arange(0,X.shape[0])
    dc_tree = construct_decision_tree(one_hot_encoding, categorical_cols.copy(), numeric_cols, None, indices, X, Y, MAX_DEPTH)
    return dc_tree

In [None]:
PART = 'a'
one_hot_encoding = True
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

print("Loading data...")
Xtrain, Ytrain, values_dict = load_data(one_hot_encoding, numeric_cols, train_path)

print("Loading test data...")
Xtest, Ytest = load_data(one_hot_encoding, numeric_cols, test_path, values_dict)

print("Loading val data...")
Xval, Yval = load_data(one_hot_encoding, numeric_cols, val_path, values_dict)

In [96]:
# if PART == 'a':
dc_tree = decision_tree(one_hot_encoding, categorical_cols, numeric_cols, Xtrain, Ytrain)

Attr: education ,  Gain:  -0.35969285154524766 ,  Depth: 0
Attr: default ,  Gain:  -0.36981781795613866 ,  Depth: 1
Attr: loan ,  Gain:  -0.37179719555310364 ,  Depth: 2
Attr: contact ,  Gain:  -0.5315503995751697 ,  Depth: 3
Attr: housing ,  Gain:  -0.4099269489923952 ,  Depth: 4
Attr: campaign ,  Gain:  -0.7013150052221249 ,  Depth: 5
Attr: campaign ,  Gain:  -0.7648638879166092 ,  Depth: 6
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 7
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 8
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 9
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 10
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 11
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 12
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 13
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 14
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 15
Attr: campaign ,  Gain:  -0.5865930247093258 ,  Depth: 16
Attr: campaign ,  Gain:  -0

In [97]:
def predict_recursive(one_hot_encoding, x, root, numeric_cols):
    if len(root.child) == 0:
        return root.decision
    else:
        val = ''
        if root.attr in numeric_cols:
            if x[root.attr] > root.median:
                val = 'right'
            else:
                val = 'left'
        elif one_hot_encoding == True:
            index = x[root.attr]
            if index % 2 == 0:
                val = 'right'
            else:
                val = 'left'
        else:            
            val = x[root.attr]
        
        for i in range(len(root.child)):
            if root.child[i].value == val:
                return predict_recursive(one_hot_encoding, x, root.child[i], numeric_cols)
        return root.decision
    
def predict(one_hot_encoding, X, root, numeric_cols):
    Y_pred = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
    # for i in range(1):
        z = predict_recursive(one_hot_encoding, X.iloc[i], root, numeric_cols)
        Y_pred[i] = int(z)
    Y_pred = Y_pred.astype('int64')
    return Y_pred

In [98]:
def print_tree(root):
    if root.parent==None:
        print("Root Node. Feature Used to Split-> " + str(root.attr))
        print("Decision: " + str(root.decision))
        child_list = []
        for i in range(len(root.child)):
            child_list.append(root.child[i].value)
            
        print("Child Values -> " + str(child_list))

        for c in root.child:
            print_tree(c)
    else:
        print("Child splitted on feature -> " + str(root.parent.attr))
        print("Decision: " + str(root.decision))
        print("Value of child is -> " + str(root.value))
        # print("Feature to be used -> " + str(root.attr))
        child_list = []
        for i in range(len(root.child)):
            child_list.append(root.child[i].value)
            
        print("Child Values -> " + str(child_list))

        for c in root.child:
            print_tree(c)        

In [None]:
# print_tree(dc_tree)

In [99]:
def accuracy(y1,y2):
    count = 0.0
    for i in range(y1.shape[0]):
        if y1[i] == y2[i]:
            count+=1.0
    return (100.0*count)/(y1.shape[0])

In [100]:
y_pred = predict(one_hot_encoding, Xtrain, dc_tree, numeric_cols)
print("Accuracy: {}%".format(round(accuracy(y_pred,Ytrain),3)))

Accuracy: 91.15%


In [101]:
y_pred2 = predict(one_hot_encoding, Xtest, dc_tree, numeric_cols)
print("Accuracy: {}%".format(round(accuracy(y_pred2,Ytest),3)))

Accuracy: 86.662%


In [102]:
y_pred3 = predict(one_hot_encoding, Xval, dc_tree, numeric_cols)
print("Accuracy: {}%".format(round(accuracy(y_pred3,Yval),3)))

Accuracy: 87.461%


In [56]:
def bfs(root):
    i = 0
    node_list = [root]
    while i < len(node_list):
        top_node = node_list[i]
        if len(top_node.child) > 0:
            for c in top_node.child:
                node_list.append(c)
        i+=1
    return node_list
        

In [104]:
# PART B (POST PRUNING)

PART = 'b'
one_hot_encoding = True
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

print("Loading data...")
Xtrain, Ytrain, values_dict = load_data(one_hot_encoding, numeric_cols, train_path, {})
# Xtrain, Ytrain = load_data(one_hot_encoding, numeric_cols, train_path, {})
# values_dict = {}

print("Loading test data...")
Xtest, Ytest = load_data(one_hot_encoding, numeric_cols, test_path, values_dict)

print("Loading val data...")
Xval, Yval = load_data(one_hot_encoding, numeric_cols, val_path, values_dict)

Loading data...
Loading test data...
Loading val data...


In [120]:
dc_tree = decision_tree(one_hot_encoding, categorical_cols, numeric_cols, Xtrain, Ytrain, 10)

default -0.10934729882807671   Depth: 0
loan -0.11016854635468831   Depth: 1
poutcome -0.11784643220514694   Depth: 2
contact -0.12811406225705424   Depth: 3
housing -0.11813468947245419   Depth: 4
day -0.2074327874843914   Depth: 5
campaign -0.25931886115814284   Depth: 6
job -0.2835353709769059   Depth: 7
balance -0.2727843215170179   Depth: 8
month -0.23720701790454535   Depth: 9
marital -0.14537550864882468   Depth: 10
education -0.17024525786771055   Depth: 10
campaign -0.25644055294661583   Depth: 9
campaign -0.22650712137882847   Depth: 10
marital -0.10845590031244788   Depth: 10
education -0.33162995834925435   Depth: 8
marital -0.2634601002912583   Depth: 9
campaign -0.3165158014414191   Depth: 10
month -0.08435349194116182   Depth: 10
day -0.38826022412553524   Depth: 9
pdays -0.20554032201310324   Depth: 10
month -0.35226298211053464   Depth: 10
campaign -0.17897777962979783   Depth: 7
day -0.23953309320902536   Depth: 8
marital -0.18567933316093094   Depth: 9
day -0.3185192

In [121]:
node_list = bfs(dc_tree)
train_accuracy = []
test_accuracy = []
val_accuracy = []
node_count = [len(node_list)]

train_accuracy.append(accuracy_score(predict(one_hot_encoding, Xtrain, dc_tree, numeric_cols), Ytrain))
test_accuracy.append(accuracy_score(predict(one_hot_encoding, Xtest, dc_tree, numeric_cols), Ytest))
val_accuracy.append(accuracy_score(predict(one_hot_encoding, Xval, dc_tree, numeric_cols), Yval))


In [122]:
print(train_accuracy, test_accuracy, val_accuracy)
print(len(node_list))
# [0.8930269851802699] [0.8896261888962619] [0.8920831490490934]

[0.8954877239548772] [0.8854235788542358] [0.8911985846970367]
1704


In [123]:
iteration = 0
val_best_accuracy = 0.0

while True:
    iteration += 1
    previous_accuracy = val_accuracy[-1]
    after_accuracy    = None
    tree_best_node    = tree
    
    count = 0
    for node in node_list:
        count = count + 1
        if len(node.child) > 0:
            child_temp = node.child
            node.child = []
            after_accuracy = accuracy_score(predict(one_hot_encoding, Xval, dc_tree, numeric_cols), Yval)
            
            print("After: ",after_accuracy,"Val Best: ",val_best_accuracy,"Node number:",count)
            if after_accuracy > val_best_accuracy:
                val_best_accuracy = after_accuracy
                tree_best_node = node
            node.child = child_temp
    
    print("Best Accuracy: ",val_best_accuracy,"  Previous: ",previous_accuracy)
    if val_best_accuracy > previous_accuracy:
        tree_best_node.child = [] # PRUNING
    
        node_list = bfs(dc_tree)
        train_accuracy.append(accuracy_score(predict(one_hot_encoding, Xtrain, dc_tree, numeric_cols), Ytrain))
        test_accuracy.append(accuracy_score(predict(one_hot_encoding, Xtest, dc_tree, numeric_cols), Ytest))
        val_accuracy.append(accuracy_score(predict(one_hot_encoding, Xval, dc_tree, numeric_cols), Yval))
        node_count.append(len(node_list))

        print("Iteration: {}, Val_accuracy: {}, Node_count: {}".format(iteration, val_accuracy[-1], node_count[-1]))        
    else:
        break

After:  0.8861123396727112 Val Best:  0.0 Node number: 1
After:  0.885891198584697 Val Best:  0.8861123396727112 Node number: 2
After:  0.8914197257850509 Val Best:  0.8861123396727112 Node number: 3
After:  0.8861123396727112 Val Best:  0.8914197257850509 Node number: 4
After:  0.8909774436090225 Val Best:  0.8914197257850509 Node number: 5
After:  0.8914197257850509 Val Best:  0.8914197257850509 Node number: 6
After:  0.8911985846970367 Val Best:  0.8914197257850509 Node number: 7
After:  0.8911985846970367 Val Best:  0.8914197257850509 Node number: 8
After:  0.8861123396727112 Val Best:  0.8914197257850509 Node number: 9
After:  0.8911985846970367 Val Best:  0.8914197257850509 Node number: 10
After:  0.8909774436090225 Val Best:  0.8914197257850509 Node number: 11
After:  0.8911985846970367 Val Best:  0.8914197257850509 Node number: 12
After:  0.8914197257850509 Val Best:  0.8914197257850509 Node number: 13
After:  0.8911985846970367 Val Best:  0.8914197257850509 Node number: 14
Aft

KeyboardInterrupt: 

In [124]:
print(node_count)
print("Train: ", train_accuracy)
print("Test: ", test_accuracy)
print("Val: ", val_accuracy)

[1704, 1595, 1294, 1288]
Train:  [0.8954877239548772, 0.8947135589471356, 0.8945753151957532, 0.8944370714443707]
Test:  [0.8854235788542358, 0.8885202388852024, 0.8885202388852024, 0.8891838088918381]
Val:  [0.8911985846970367, 0.8931888544891641, 0.8934099955771783, 0.8936311366651923]


In [126]:
# Part C Random Forest
def load_data_rf(numeric_cols, filename, values_dict = {}):
    df = pd.read_csv(filename, delimiter = ';')
    Y = df['y'].copy()
    Y  = Y.to_numpy()
    for i in range(Y.shape[0]):
        if Y[i] == 'yes':
            Y[i] = 1
        else:
            Y[i] = 0 #Assigning 0 to nan values
    
    Y = Y.astype('int64')
    df = df.drop(['y'],axis=1)
                
    if values_dict == {}:
        for col in df.columns:
            if col not in numeric_cols:
                values = list(set(list(df[col])))
                values_dict[col] = values                
                for i in range(df.shape[0]):
                    temp = df[col][i]
                    df[col][i] = np.zeros(len(values))
                    df[col][i][values_dict[col].index(temp)] = 1
        return df, Y, values_dict
    
    else:
        for col in df.columns:
            if col not in numeric_cols:
                for i in range(df.shape[0]):
                    temp = df[col][i]
                    df[col][i] = np.zeros(len(values_dict[col]))
                    if temp in values_dict[col]:
                        df[col][i][values_dict[col].index(temp)] = 1                        
        return df, Y    

In [127]:
PART = 'c'
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

print("Loading train data...")
Xtrain_rf, Ytrain_rf, values_dict_rf = load_data_rf(numeric_cols, train_path, {})

print("Loading test data...")
Xtest_rf, Ytest_rf = load_data_rf(numeric_cols, test_path, values_dict_rf)

print("Loading val data...")
Xval_rf, Yval_rf = load_data_rf(numeric_cols, val_path, values_dict_rf)

Loading train data...
Loading test data...


TypeError: load_data_rf() takes from 2 to 3 positional arguments but 4 were given

In [129]:
best_params = {
    "num_estimators": 14,
    "bootstrap": True,
    "num_features": 52,
    "accuracy": -1
}

n_estimators = [50, 150, 250, 350, 450]
max_features = [0.1, 0.3, 0.5, 0.7, 0.9]
min_samples_split = [2, 4, 6, 8, 10]

bootstrap_list = [True, False]
max_features_list = np.arange(1,Xtrain_rf.shape[1])

In [None]:
for n in n_estimators:
    for bs in bootstrap_list:
        for m in max_features_list:
            rf = RandomForestClassifier(n_estimators=n, criterion="entropy", bootstrap=bs, max_features=m, min_samples_split=10)
            # rf.fit(Xtrain_rf, Ytrain_rf.reshape(-1))
            rf.fit(Xtrain, Ytrain.reshape(-1))
            # acc = accuracy_score(Yval_rf, np.array(rf.predict(Xval_rf), dtype=int))
            acc = accuracy_score(Yval, np.array(rf.predict(Xval), dtype=int))
            if acc > best_params['accuracy']:
                print("Accuracy: ", acc)                
                best_params['num_estimators'] = n
                best_params['bootstrap'] = bs
                best_params['num_features'] = m
                best_params['accuracy'] = acc
                break

print(best_params)