<a href="https://colab.research.google.com/github/pranscript/ml_coursework/blob/master/Homework_4_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from functools import reduce
import io
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score 
warnings.simplefilter(action = 'ignore')

In [0]:
class Node:
    """A node of a decision tree over categorical attributes.

    An internal node stores the column it splits on in ``name`` and keeps one
    child per branch: ``branches[i]`` is the subtree reached when the row's
    value for that column equals ``branch_names[i]``.  A leaf has ``name`` set
    to ``None`` and stores the predicted label in ``decision``.
    """

    def __init__(self, col=None, dec=None):
        # Either col or dec should be None (internal node or leaf).
        self.name = col
        self.decision = dec
        self.branches = []
        self.branch_names = []

    def prune(self):
        """Placeholder for post-pruning of the trained tree (not implemented)."""
        pass

    def predict(self, df):
        """Predict a label for every row of ``df``.

        Returns:
            A ``pd.Series`` with one prediction per row, in row order.  The
            Series gets a fresh default index; the index of ``df`` is not
            carried over.
        """
        return pd.Series([self.row_predict(row) for _, row in df.iterrows()])

    def row_predict(self, row):
        """Predict the label of a single row by walking from this node to a leaf.

        Raises:
            ValueError: if the row holds a value for a split column that has
                no branch at the node splitting on that column.
        """
        node = self
        while node.name is not None:
            value = row[node.name]
            try:
                position = node.branch_names.index(value)
            except ValueError:
                # Same exception type list.index raises, but say which column
                # and value are missing from the tree.
                raise ValueError(
                    "No branch for value {!r} of column {!r}; known values: {!r}".format(
                        value, node.name, node.branch_names)
                ) from None
            node = node.branches[position]
        return node.decision

    def print(self, name="Root", lev=0):
        """Print the subtree rooted at this node, one line per node.

        ``name`` is the branch value leading to this node and ``lev`` its
        depth; each level is indented by three spaces.  Internal nodes are
        tagged ``N:`` followed by the split column, leaves ``L:`` followed by
        the decision.
        """
        # str() so branch values that are not strings (e.g. ints) can be shown.
        prefix = lev * "   " + "-(" + str(name) + ")-> "
        if self.name is not None:
            print(prefix + "N: ", self.name)
            for branch_name, branch in zip(self.branch_names, self.branches):
                branch.print(branch_name, lev + 1)
        else:
            print(prefix + "L: ", self.decision)
    
def entropy(label):
    """Shannon entropy (base 2) of the values in ``label``.

    Args:
        label: pandas Series of class labels.

    Returns:
        The entropy in bits; 0 when ``label`` is empty or holds a single
        distinct value.
    """
    total = len(label)
    # Iterate over the counts themselves instead of indexing value_counts()
    # with 0..k-1: on a Series of integer classes an integer key is a label
    # lookup, which returns the wrong count or raises KeyError.
    return sum(
        -1 * (count / total) * math.log2(count / total)
        for count in label.value_counts()
    )

def gain_ratio(current_ent, col, df):
    """Gain ratio obtained by branching ``df`` on column ``col``.

    Args:
        current_ent: entropy of the label column of ``df`` before the split.
        col: name of the candidate split column.
        df: DataFrame whose first column is the label.

    Returns:
        Information gain divided by the intrinsic value of the split, rounded
        to three decimals; 0 when the intrinsic value is 0 (``col`` has at
        most one distinct value).
    """
    total = len(df)
    labels = df.iloc[:, 0]
    remainder = 0        # weighted label entropy left after the split
    intrinsic_value = 0  # entropy of the split itself

    # items() yields (attribute value, count) pairs.  Indexing the counts with
    # 0..k-1 instead would be a label lookup when the attribute values are
    # integers, returning the wrong count or raising KeyError.
    for value, count in df[col].value_counts().items():
        weight = count / total  # probability of this attribute value
        remainder += weight * entropy(labels[df[col] == value])
        intrinsic_value += -1 * weight * math.log2(weight)

    if intrinsic_value == 0:
        return 0

    gain = current_ent - remainder
    return round(gain / intrinsic_value, 3)


def _information_gain(current_ent, col, df):
    """Information gain (ID3 criterion): label entropy removed by splitting ``df`` on ``col``."""
    total = len(df)
    labels = df.iloc[:, 0]
    remainder = 0
    for value, count in df[col].value_counts().items():
        remainder += (count / total) * entropy(labels[df[col] == value])
    return current_ent - remainder


def fit_dt(df, algorithm):
    """Recursively train a decision tree.

    Args:
        df: DataFrame whose first column is the label (y); every other column
            is a feature that can be branched on.
        algorithm: 'id3' to choose splits by information gain, 'c4.5' to
            choose them by gain ratio.

    Returns:
        The root ``Node`` of the tree trained on ``df``.

    Raises:
        ValueError: if a split has to be chosen and ``algorithm`` is neither
            'id3' nor 'c4.5'.
    """
    labels = df.iloc[:, 0]
    features = df.columns[1:]  # 0th column is the y label
    current_ent = entropy(labels)  # entropy before splitting

    # Make a leaf when no feature is left to branch on, or when the entropy is
    # (nearly) 0, i.e. the labels are (almost) all the same.
    if len(features) == 0 or current_ent < 0.01:
        return Node(None, labels.value_counts().idxmax())

    if algorithm == 'c4.5':
        score = gain_ratio
    elif algorithm == 'id3':
        score = _information_gain
    else:
        raise ValueError(
            "algorithm must be 'id3' or 'c4.5', got {!r}".format(algorithm))

    scores = [score(current_ent, col, df) for col in features]
    # argmax returns the first maximum, so ties go to the leftmost feature.
    split_col = features[np.argmax(scores)]

    # Internal node: one branch per value of the chosen column present in df.
    i_node = Node(split_col, None)
    # The split column is dropped from the data passed down to the subtrees.
    remaining_cols = [c for c in df.columns if c != split_col]
    for value in df[split_col].unique():
        subset = df.loc[df[split_col] == value, remaining_cols]
        i_node.branches.append(fit_dt(subset, algorithm=algorithm))
        i_node.branch_names.append(value)

    return i_node

In [62]:
# Colab-only upload of the first training file; the result is indexed by file
# name when the data is read.
from google.colab import files
train = files.upload()

Saving trainc45.data to trainc45.data


In [0]:
# Parse the uploaded bytes as CSV without a header row, so the columns are
# numbered 0..n-1.
train_df1 = pd.read_csv(io.BytesIO(train['trainc45.data']),header = None)
# Separate copy of the frame as loaded.
train_df_temp1 = train_df1.copy()
print (train_df1) 

In [75]:
# Colab-only upload of the second training file.
from google.colab import files
train2 = files.upload()

Saving trainc452.data to trainc452.data


In [0]:
# Parse the second uploaded file as CSV without a header row, so the columns
# are numbered 0..n-1.
train_df2 = pd.read_csv(io.BytesIO(train2['trainc452.data']),header = None)
# Separate copy of the frame as loaded.
train_df_temp2 = train_df2.copy()
print (train_df2) 

In [143]:
# Colab-only upload of the test file (`files` comes from google.colab).
test = files.upload()

Saving testc45.data to testc45.data


In [0]:
# Parse the uploaded test file as CSV without a header row, so the columns are
# numbered 0..n-1.
test_df = pd.read_csv(io.BytesIO(test['testc45.data']),header = None)
# Separate copy of the frame as loaded.
test_df_temp = test_df.copy()
print (test_df) 

In [0]:
# Data sets that take turns as validation set; the frames read from the two
# training files are named train_df1 and train_df2.
train_list = [train_df1, train_df2]


In [191]:
def validation(t_list):
    """Pick the best C4.5 tree by holding out each data set in turn.

    Every DataFrame in ``t_list`` is used once as the validation set while a
    tree is trained on the concatenation of all the others.  Trees are scored
    with the F1 score for the positive label 'Win'; the best-scoring tree is
    kept, and on equal scores the later one wins.

    Args:
        t_list: sequence of DataFrames whose column 0 holds the label.  At
            least two are needed so that something is left to train on.  The
            sequence itself is not modified.

    Returns:
        The root node of the best-scoring tree (``None`` if ``t_list`` is
        empty).
    """
    folds = list(t_list)
    c45_max_score = 0
    c45_best_tree = None

    for i, val in enumerate(folds):
        # Train on every data set except the one held out for validation.
        train = pd.concat(folds[:i] + folds[i + 1:])
        c45_temp_tree = fit_dt(train, algorithm = 'c4.5')

        # Score the freshly trained tree on the held-out data set.
        c45_prediction = c45_temp_tree.predict(val)
        c45_score = f1_score(val[0].values, c45_prediction.values, pos_label = 'Win')

        if c45_score >= c45_max_score:
            c45_max_score = c45_score
            c45_best_tree = c45_temp_tree

    print("Best c4.53 validation score is: ", c45_max_score)

    return  c45_best_tree

# Keep the tree that scored best across the validation rounds.
c45_tree = validation(train_list)

Best c4.53 validation score is:  0.6666666666666666


In [0]:
# True labels of the test set: column 0, kept as an (n, 1) column.
Y_test = test_df.values[:, 0:1]
# Predictions of the selected tree on the test set.  The metric cells read
# `c45_prediction`, which is otherwise only a local variable of validation().
c45_prediction = c45_tree.predict(test_df)

In [196]:
# Accuracy of `c45_prediction` against the true test labels in `Y_test`.
print("Accuracy:",accuracy_score(Y_test, c45_prediction))

Accuracy: 0.9090909090909091


In [197]:
# Precision of `c45_prediction` on the test labels, with 'Win' as the positive class.
print("Precision:", precision_score(Y_test, c45_prediction, pos_label='Win'))

Precision: 1.0


In [198]:
# Recall of `c45_prediction` on the test labels, with 'Win' as the positive class.
print("Recall:", recall_score(Y_test, c45_prediction, pos_label='Win'))

Recall: 0.875


In [199]:
# F1 score of `c45_prediction` on the test labels, with 'Win' as the positive class.
print("F1:", f1_score(Y_test, c45_prediction, pos_label='Win'))

F1: 0.9333333333333333
