In [7]:
import pandas as pd
import math

def closest(lst, K): 
    return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))]

class node:
    def __init__(self, columnValues, Label):
        self.attribute = None
        self.branch = {}
        self.value = 0
        self.end = False
        self.columnValues = columnValues
        self.Label = Label

    def getIG(self,table, col, current_entropy):
        red = 0
        for val in self.columnValues[col]:
            temp = table[table[col] == val]
            e = self.entropy(temp)
            temp_rows = temp[self.Label].count()
            total_rows = table[self.Label].count()
            e = (temp_rows*e)/total_rows
            red += e
        return current_entropy - red
      
    def entropy(self,table):
        c = table[self.Label].value_counts()
        a = 0
        b = 0
        if 1 in c.keys():
            a = c[1]
        if 0 in c.keys():
            b = c[0]
        total = a+b
        val_a = 0
        val_b = 0
        if a>0:
            P_a = a/total
            val_a = P_a*math.log(P_a, 2.0)
        if b>0:
            P_b = b/total
            val_b = P_b*math.log(P_b, 2.0)
        return -(val_a + val_b)
        
    def train(self, table):
        if len(table.columns)==1:
            c = table[self.Label].value_counts()
            a = 0
            b = 0
            if 1 in c.keys():
                a = c[1]
            if 0 in c.keys():
                b = c[0]
            if a>0:
                self.value = a/(a+b)
            else:
                self.value = 0.0
            self.end = True
            return
        
        current_entropy = self.entropy(table)
        if current_entropy == 0.0:
            c = table[self.Label].value_counts()
            a = 0
            b = 0
            if 1 in c.keys():
                a = c[1]
            if 0 in c.keys():
                b = c[0]
            if a>0:
                self.value = a/(a+b)
            else:
                self.value = 0.0
            self.end = True
            return
        
        possible_attributes = {}
        for column in table.columns:
            if column == self.Label:
                continue
            possible_attributes[column] = self.getIG(table, column, current_entropy)
        selected_column = max(possible_attributes, key=possible_attributes.get)
        self.attribute = selected_column

        for value in self.columnValues[selected_column]:
            temp = table[table[selected_column] == value]
            temp = temp.loc[:, temp.columns != selected_column]
            self.branch[value] = node(self.columnValues, self.Label)
            self.branch[value].train(temp)

    def predict(self,row):
        if self.end:
            return self.value

        else:
            direction = row[self.attribute]
            direction = closest(list(self.branch.keys()), direction)
            return self.branch[direction].predict(row)

In [8]:
import pandas as pd
import math

class tree:
    def __init__(self, df, Label):
        self.df = df
        self.Label = Label
        # self.clean()
        self.columnValues = {}
        for col in df.columns:
            self.columnValues[col] = []
            for val in df[col].unique():
                self.columnValues[col].append(val)
        self.head = node(self.columnValues, self.Label)
        self.train()


    def train(self):
        self.head.train(self.df)

    def predict(self, test):
        return self.head.predict(test)

In [10]:
%matplotlib tk

import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from datetime import datetime
import numpy as np
import pandas as pd
import random
import statistics 
random.seed(1)


startTime = datetime.now()
label = 'Selector'

def normalize(x):
    result = x.copy()
    for feature_name in x.columns:
        max_value = x[feature_name].max()
        min_value = x[feature_name].min()
        if feature_name != label:
            result[feature_name] = (((((x[feature_name] - min_value) / (max_value - min_value))//0.05)*0.05)+0.05).round(3)
    return result

features = ['Age','Gender','TB','DB','Alkphos','Sgpt','Sgot','TP','ALB','A/G']

# reading in the csv as a dataframe
df = pd.read_csv('ILPD.csv')
df = df[features + [label]]
df = normalize(df)

# selecting the features and target
X = df[features]
y = df[label]

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split



train ,test = train_test_split(df, test_size = 0.25)
train1 ,test1 = train_test_split(train, test_size = 0.9)
train2 ,test2 = train_test_split(train, test_size = 0.9)
train3 ,test3 = train_test_split(train, test_size = 0.9)
# print(df)

def test_train(tree, row,threshold = 0.5):
    output = tree.predict(row)
    val = output
    if val<threshold:
        return 0
    else:
        return 1

def testing(tree, train, test, threshold = 0.5):
    count_train = 0
    count_test = 0
    total_train = 0
    total_test = 0
    for _,row in train.iterrows():
        total_train += 1
        output = tree.predict(row)
        val = output

        if val<threshold:
            val=0
        else:
            val=1
        if val != row[label]:
            count_train += 1
    for _,row in test.iterrows():
        total_test += 1
        output = tree.predict(row)
        val = output

        if val<threshold:
            val=0
        else:
            val=1
        if val != row[label]:
            count_test += 1
    error_rate_test = count_test/total_test
    error_rate_train = count_train/total_train
    error_rate = (count_train+count_test)/(total_test+total_train)
    return [error_rate, error_rate_train, error_rate_test]


# training
trees = []
featureSet1 = ['Gender','DB','Sgpt','TP','A/G']
t1 = train1[featureSet1+[label]]
treepointer1 = tree(t1, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer1,train1,test1)
print("1. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer1)

featureSet2 = ['Sgot','TP','ALB','A/G']
t2 = train2[featureSet2+[label]]
treepointer2 = tree(t2, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer2,train2,test2)
print("2. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer2)

featureSet3 = ['Age','TB','Sgpt','A/G']
t3 = train3[featureSet3+[label]]
treepointer3 = tree(t3, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer3,train3,test3)
print("3. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer3)

featureSet4 = ['Gender','Sgpt','Sgot','TP','ALB']
t4 = train1[featureSet4+[label]]
treepointer4 = tree(t4, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer4,train1,test1)
print("4. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer4)

featureSet5 = ['Age','TB','DB','A/G']
t5 = train2[featureSet5+[label]]
treepointer5 = tree(t5, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer5,train2,test2)
print("5. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer5)

featureSet6 = ['TB','DB','Alkphos','TP']
t6 = train3[featureSet6+[label]]
treepointer6 = tree(t6, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer6,train3,test3)
print("6. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer6)

featureSet7 = ['Age','Alkphos','Sgpt','Sgot','TP']
t7 = train1[featureSet7+[label]]
treepointer7 = tree(t7, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer7,train1,test1)
print("7. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer7)

featureSet8 = ['Age','Sgpt','Sgot','A/G']
t8 = train2[featureSet8+[label]]
treepointer8 = tree(t8, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer8,train2,test2)
print("8. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer8)

featureSet9 = ['Age','TB','Alkphos','Sgot','ALB']
t9 = train3[featureSet9+[label]]
treepointer9 = tree(t9, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer9,train3,test3)
print("9. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer9)

featureSet10 = ['Age','Gender','TB','DB','Alkphos']
t10 = train2[featureSet10+[label]]
treepointer10 = tree(t10, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer10,train2,test2)
print("10. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer10)

featureSet11 = ['Sgpt','Sgot','TP','ALB','A/G']
t11 = train3[featureSet11+[label]]
treepointer11 = tree(t11, label)
error_rate, error_rate_train, error_rate_test = testing(treepointer11,train3,test3)
print("11. error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")
trees.append(treepointer11)

final_count_train = 0
final_total_train = 0
final_error_train = 0
final_count_test = 0
final_total_test = 0
final_error_test = 0

y_pred = []
y_test = np.array(test[label])


# Checking on Final Training Set
for index,row in train.iterrows():
    final_total_train += 1
    output = [0]*len(trees)
    for i in range(len(trees)):
        output[i] = test_train(trees[i], row)
    val = statistics.mode(output)
    if val == row[label]:
        pass
    else:
        final_count_train += 1

# Checking on Final Testing Set
for index,row in test.iterrows():
    final_total_test += 1
    output = [0]*len(trees)
    for i in range(len(trees)):
        output[i] = test_train(trees[i], row)
    val = statistics.mode(output)
    y_pred.append(val)
    if val == row[label]:
        pass
    else:
        final_count_test += 1
        
error_rate_test = final_count_test/final_total_test
error_rate_train = final_count_train/final_total_train
error_rate = (final_count_test+final_count_train)/(final_total_test+final_total_train)
print("\nfinal error :", "{:5.4f}".format(error_rate), "\ttrain error :", "{:5.4f}".format(error_rate_train), "\ttest error :", "{:5.4f}".format(error_rate_test), "\n\n")

# Plotting the confusion matrix
cm = confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm,show_absolute=True,colorbar=True)
plt.show()



# printing execution time of script
print("\n")
print("Execution time in seconds = ", datetime.now() - startTime)

1. error : 0.2196 	train error : 0.0248 	test error : 0.2412 


2. error : 0.2064 	train error : 0.0331 	test error : 0.2256 


3. error : 0.1804 	train error : 0.0372 	test error : 0.1962 


4. error : 0.2047 	train error : 0.0207 	test error : 0.2251 


5. error : 0.2225 	train error : 0.0661 	test error : 0.2398 


6. error : 0.2150 	train error : 0.0909 	test error : 0.2288 


7. error : 0.1804 	train error : 0.0041 	test error : 0.1999 


8. error : 0.1985 	train error : 0.0331 	test error : 0.2169 


9. error : 0.1531 	train error : 0.0165 	test error : 0.1683 


10. error : 0.1692 	train error : 0.0372 	test error : 0.1839 


11. error : 0.1465 	train error : 0.0248 	test error : 0.1600 



final error : 0.0792 	train error : 0.0747 	test error : 0.0928 




Execution time in seconds =  0:01:03.569129
