In [1]:
import numpy as np # linear algebra
import pandas as pd
from collections import Counter
from statistics import mode

In [2]:
# Toy training set. Each row is one person:
# [loves popcorn, loves soda, age, loves the film "Cool as Ice"].
data = [
    ["Yes", "Yes", 7, "No"],
    ["Yes", "No", 12, "No"],
    ["No", "Yes", 18, "Yes"],
    ["No", "Yes", 35, "Yes"],
    ["Yes", "Yes", 38, "Yes"],
    ["Yes", "No", 50, "No"],
    ["No", "No", 83, "No"],
]
data_df = pd.DataFrame(
    data,
    columns=["Loves Popcorn", "Loves Soda", "Age", "Loves Cool as Ice"],
)
# Name of the label column the tree learns to predict; read as a global by the helpers below.
target = "Loves Cool as Ice"

In [3]:
# Partition the column labels by dtype. `df_cat` (object columns) is what the
# split helpers consult to choose equality splits over threshold splits.
df_cat, df_num = (
    data_df.select_dtypes(include=dtype_kind).columns
    for dtype_kind in (np.object_, np.number)
)
print("Categorical:", df_cat)
print("Continuous:", df_num)

Categorical: Index(['Loves Popcorn', 'Loves Soda', 'Loves Cool as Ice'], dtype='object')
Continuous: Index(['Age'], dtype='object')


In [4]:
def calc_gini_impurity(leaf):
    """Return the Gini impurity of the labels in ``leaf``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the share of
    each distinct label. An empty leaf has impurity 0.

    Args:
        leaf: sized iterable of hashable class labels.

    Returns:
        0 for an empty leaf, otherwise a float in [0, 1).
    """
    size = len(leaf)
    if size <= 0:
        return 0

    squared_shares = 0
    for count in Counter(leaf).values():
        squared_shares = squared_shares + (count / size) ** 2
    return 1 - squared_shares

def calc_weighted_gini(left_leaf, right_leaf, total):
    """Return the size-weighted average Gini impurity of two leaves.

    Args:
        left_leaf: labels that fell on the left side of a split.
        right_leaf: labels that fell on the right side of a split.
        total: divisor for the weighted sum (callers pass the row count
            of the frame being split).

    Returns:
        ``(len(left) * gini(left) + len(right) * gini(right)) / total``.
    """
    weighted = [
        len(leaf) * calc_gini_impurity(leaf)
        for leaf in (left_leaf, right_leaf)
    ]
    return (weighted[0] + weighted[1]) / total

In [5]:
def determine_gini_cat(var,df):
    """Find the best one-vs-rest split of a categorical column.

    Every distinct value ``v`` of ``df[var]`` is tried as a split: rows with
    ``df[var] == v`` go left, all other rows go right. The candidate with the
    lowest weighted Gini impurity of the ``target`` labels wins.

    Args:
        var: name of the categorical column to split on.
        df: frame holding ``var`` and the notebook-level ``target`` column.

    Returns:
        ``(best_value, min_gini)``. If the column has no values the
        sentinels ``("", 100000000000)`` are returned.
    """
    column = df[var]
    # Iterate candidates in a deterministic order. Complementary splits of a
    # binary column ("Yes" vs "No") tie exactly, and the first candidate wins
    # ties, so iterating a set() of strings made the reported split depend on
    # Python's per-process hash randomisation. key=str keeps the sort usable
    # for columns that mix types.
    candidates = sorted(column.unique(), key=str)
    n_rows = len(column)

    best_value = ""
    min_gini = 100000000000
    for value in candidates:
        left_leaf = df[df[var] == value][target]
        right_leaf = df[df[var] != value][target]
        weighted_gini = calc_weighted_gini(left_leaf, right_leaf, n_rows)
        # Strict '<' keeps the earliest candidate on ties.
        if weighted_gini < min_gini:
            min_gini = weighted_gini
            best_value = value
    return best_value, min_gini

def determine_gini_num(var,df):
    """Find the best ``<= threshold`` split of a numeric column.

    Candidate thresholds are the means of each pair of neighbouring values
    in the sorted column. Rows with ``df[var] <= t`` go left, the rest go
    right, and the threshold with the lowest weighted Gini impurity of the
    ``target`` labels wins (the earliest candidate wins ties).

    Args:
        var: name of the numeric column to split on.
        df: frame holding ``var`` and the notebook-level ``target`` column.

    Returns:
        ``(best_threshold, min_gini)``. With fewer than two rows there are
        no candidates and the sentinels ``("", 100000000000)`` are returned.
    """
    ordered = df[var].sort_values()
    # The first rolling window has a single value, so its mean is NaN; skip it.
    midpoints = ordered.rolling(2).mean().iloc[1:]
    n_rows = len(df[var])

    chosen_threshold = ""
    lowest_gini = 100000000000
    for cut in midpoints:
        below = df.loc[df[var] <= cut, target]
        above = df.loc[df[var] > cut, target]
        score = calc_weighted_gini(below, above, n_rows)
        if score < lowest_gini:
            lowest_gini, chosen_threshold = score, cut
    return chosen_threshold, lowest_gini

def determine_gini_var(var, df):
    """Score the best split of column ``var``, dispatching on its kind.

    Columns listed in the notebook-level ``df_cat`` index are scored with
    ``determine_gini_cat``; every other column with ``determine_gini_num``.

    Returns:
        The ``(best_split, min_gini)`` pair produced by the chosen scorer.
    """
    scorer = determine_gini_cat if var in df_cat else determine_gini_num
    return scorer(var, df)
        

In [6]:
def get_node_cat(df,var,threshold):
    """Split ``df`` on a categorical column.

    Returns:
        ``(left, right)``: the rows where ``df[var] == threshold`` and the
        rows where ``df[var] != threshold``.
    """
    column = df[var]
    matching, others = column == threshold, column != threshold
    return df.loc[matching], df.loc[others]

def get_node_num(df,var,threshold):
    """Split ``df`` on a numeric column.

    Returns:
        ``(left, right)``: the rows where ``df[var] <= threshold`` and the
        rows where ``df[var] > threshold``.
    """
    column = df[var]
    at_or_below, above = column <= threshold, column > threshold
    return df.loc[at_or_below], df.loc[above]

def get_node(df,var,threshold):
    """Split ``df`` into ``(left, right)`` child frames on ``var``.

    Columns listed in the notebook-level ``df_cat`` index are split by
    equality (``get_node_cat``); all others by ``<=`` (``get_node_num``).
    """
    splitter = get_node_cat if var in df_cat else get_node_num
    return splitter(df, var, threshold)

In [7]:
def is_same(l):
    """Return True when every element of ``l`` equals its first element.

    An empty sequence counts as uniform and returns True.
    """
    for item in l:
        if not item == l[0]:
            return False
    return True

In [8]:
def determine_node_dir(var,threshold,sample=None):
    """Decide whether a sample goes to the left child of a split.

    Args:
        var: name of the feature the node splits on.
        threshold: split value; compared with ``==`` for columns listed in
            the notebook-level ``df_cat`` index and with ``<=`` otherwise.
        sample: frame whose row labelled ``0`` is the observation to route.
            When omitted, the notebook-level ``df_predict`` frame is used,
            which is how the original two-argument call behaved.

    Returns:
        True if the sample belongs in the left child, False otherwise.
    """
    if sample is None:
        # Legacy behaviour: depend on the global frame. Passing `sample`
        # explicitly avoids this hidden state.
        sample = df_predict
    value = sample.at[0, var]
    if var in df_cat:
        return bool(value == threshold)
    return bool(value <= threshold)

In [9]:
def best_split(df):
    """Pick the feature and split value with the lowest weighted Gini impurity.

    Every column of ``df`` except the notebook-level ``target`` column is
    scored with ``determine_gini_var``.

    Args:
        df: frame holding the feature columns and the ``target`` column.

    Returns:
        ``(feature, threshold, gini)`` for the winning split. When several
        features tie, the one that appears first in ``df.columns`` wins.

    Raises:
        ValueError: if ``df`` has no column other than ``target``.
    """
    candidates = []
    for feature in df.columns.to_list():
        if feature == target:
            continue
        threshold, gini = determine_gini_var(feature, df)
        candidates.append((feature, threshold, gini))

    if not candidates:
        raise ValueError("best_split needs at least one feature column besides the target")

    # min() returns the first minimal element, which preserves the
    # first-feature-wins tie-break, and computes the minimum only once.
    return min(candidates, key=lambda candidate: candidate[2])

In [10]:
def create_leave(df):
    """Build a leaf node labelled with the most common ``target`` value in ``df``."""
    majority_label = mode(df[target])
    return tree_classifier(majority_label)

In [11]:
def create_node(t,df):
    """Create a node for ``df`` or grow the subtree below an existing node.

    Args:
        t: ``None`` to create a fresh node, or an existing node whose
            ``root`` is a ``(feature, threshold, gini)`` tuple.
        df: training rows that reach this node (must contain ``target``).

    Returns:
        * ``t is None`` and fewer than 3 rows: a leaf holding the majority label.
        * ``t is None`` otherwise: a childless node holding ``best_split(df)``.
        * existing node: the same object ``t``, with ``left``/``right``
          populated recursively.
    """
    if t is None:
        if len(df) < 3:
            return create_leave(df)
        return tree_classifier(best_split(df))

    var, threshold, gini = t.root[0], t.root[1], t.root[2]
    left_node, right_node = get_node(df, var, threshold)

    if len(left_node) == 0 or len(right_node) == 0:
        # Degenerate split: no candidate separated the rows, so one child
        # would receive all of df again. Recursing would never terminate
        # (and mode() fails on the empty side), so turn this node into a
        # leaf carrying the majority label instead.
        t.root = mode(df[target])
        t.left = None
        t.right = None
        return t

    t.left = _grow_child(left_node, gini)
    t.right = _grow_child(right_node, gini)
    return t


def _grow_child(subset, parent_gini):
    """Return the finished subtree (or leaf) for one side of a split."""
    # Stop when the side is tiny, the parent split was already perfect,
    # or every label on this side agrees.
    if (len(subset) < 3) or parent_gini == 0.000 or is_same(subset[target].to_list()):
        return create_leave(subset)
    return create_node(tree_classifier(best_split(subset)), subset)
                                  

In [12]:
class tree_classifier():
    """One node of a binary decision tree.

    ``root`` holds the payload of the node: a ``(feature, threshold, gini)``
    tuple for an internal node, or the predicted label for a leaf.
    ``left`` / ``right`` are the child nodes (``None`` when absent).
    """

    def __init__(self,val=None):
        # A fresh list per instance; a mutable default argument would be
        # shared by every tree created without an explicit value.
        self.root=[] if val is None else val
        self.left=None
        self.right=None

    def print_tree(self):
        """Print the payload of every node in pre-order (node, left, right)."""
        if self.root is not None:
            print(self.root)
        if self.left is not None:
            self.left.print_tree()
        if self.right is not None:
            self.right.print_tree()

    def _goes_left(self, df_predict):
        """Return True when the sample in ``df_predict`` belongs in the left child."""
        var, threshold = self.root[0], self.root[1]
        # Read the sample from the argument, not from a notebook-level global.
        value = df_predict.at[0, var]
        if var in df_cat:
            return bool(value == threshold)
        return bool(value <= threshold)

    def predict(self,df_predict):
        """Route one sample down the tree and return the payload it ends on.

        Args:
            df_predict: frame whose row labelled ``0`` holds the feature
                values of the sample.

        Returns:
            The ``root`` of the leaf that is reached. If the chosen child
            is missing, the ``root`` of the current node is returned.
        """
        if self.left is None and self.right is None:
            return self.root
        branch = self.left if self._goes_left(df_predict) else self.right
        if branch is None:
            return self.root
        return branch.predict(df_predict)

    def fit(self,df):
        """Grow the tree in place from the training frame ``df``.

        A tree built without an initial split (empty ``root`` list) first
        receives ``best_split(df)`` as its root, so ``tree_classifier().fit(df)``
        works as well as the explicit ``tree_classifier(best_split(df)).fit(df)``.
        """
        if isinstance(self.root, list) and len(self.root) == 0:
            self.root = best_split(df)
        # create_node fills in this node's children in place.
        create_node(self, df)

In [13]:
# Seed the root with the best split over the whole training set, grow the
# rest of the tree beneath it, then print every node in pre-order.
t = tree_classifier(best_split(data_df))
t.fit(data_df)
t.print_tree()

('Loves Soda', 'No', 0.21428571428571427)
No
('Age', 12.5, 0.0)
No
Yes


In [14]:
# Build a one-row frame with the training columns, fill in the features of
# the person to classify, then store the tree's prediction in the label column.
df_predict = pd.DataFrame(columns=data_df.columns.to_list())
df_predict["Age"] = [15]
df_predict["Loves Soda"] = ["Yes"]
df_predict["Loves Popcorn"] = ["No"]
df_predict["Loves Cool as Ice"] = [t.predict(df_predict)]
df_predict.head(1)

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool as Ice
0,No,Yes,15,Yes
