In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from numpy import mean
import pandas as pd
from numpy import log, dot, e

In [2]:
#!pip install matplotlib
#!pip install sklearn

## Titantic dataset

In [3]:
Xx = pd.read_csv('titanic.csv',nrows=200)
X = Xx[['Pclass','Sex','Siblings/Spouses Aboard','Parents/Children Aboard','Survived']]
X = pd.get_dummies(X)
X['target'] = X['Survived']
X = X.drop(['Survived'],axis=1)

In [4]:
#dataset columns
header = X.columns

## for user generated dataset

In [6]:
def create_random_data(hm_features,obs):

        X = np.ones((obs))

        for i in range(hm_features):
            xs = np.random.rand(obs)*100
            X = np.append(X,xs)

        X = np.reshape(X, (-1,obs)).transpose()
        ys = np.random.randint(0,2,size=obs) # 2 high as it's exclusive

        return X,ys    

In [7]:
s,t = create_random_data(4,10)

## Useful functions

In [24]:
def class_counts(rows):
    """Counts the number of each type of example in a dataset.
    Note - this function expect last column as target variable
    Output - {0:12,1:43} 12 0's and 43 1's
    """
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [29]:
def is_numeric_(val):
    '''
    check if given value is a integer or float
    '''
    return isinstance(val,int) or isinstance(val,float)

In [30]:
#Demo
is_numeric_(2)
#True

is_numeric_("red")
#False

False

In [191]:
class question():
    '''
    This is the first step in building the decision tree
    Build a question i.e. Given a column and value, what would be
    the question?
    
    question(1,3)
    >>is Siblings/Spouses Aboard >= 3 
    
    question(2,0)
    >>is Parents/Children Aboard >= 0 
    
    '''
    def __init__(self,col,val):
        self.col = col
        self.val = val
    
    def __repr__(self):
        if is_numeric_(self.val):
            return  "is %s >= %s " % (header[self.col],self.val)
        else: return "is %s == %s " % (header[self.col],self.val)
    
    def match(self,df):
        '''
        compare feature value in df to threshold value we have created 
        '''
        v = df[self.col]
        if is_numeric_(v):
            return v >= self.val
        else:
            return v == self.val

In [34]:
q.match((X[:1].values)[0])

True

In [35]:
(X[:1].values)[0]

array([3, 1, 0, 0, 1, 0], dtype=int64)

In [36]:
def partition_my_data(df,ask_question):
    true_rows, false_rows = [] , []
    for r in df:
        if ask_question.match(r):
            true_rows.append(r)
        else:
            false_rows.append(r)
            
#     true_rows_y = target[true_rows.index]
#     false_rows_y = target[false_rows.index]
    return true_rows,false_rows#true_rows_y,
#             false_rows#,false_rows_y

In [37]:
#tr, fr = partition_my_data(X.to_numpy(),question(2,0))
tr, fr = partition_my_data(X.to_numpy(),question(2,0))

In [38]:
len(tr),len(fr)

(148, 52)

In [86]:
def cal_gini(df):
    impurity = 1
    counts = class_counts(df) #hard coded
    
    
    #print(counts)
    for c in counts:
        #print(impurity,c)
        prob_of_c = counts[c] / float(len(df))
        #print(prob_of_c)
        impurity -= (prob_of_c)**2
        
    return impurity

In [89]:
def info_gain(l,r,current_info):
    p = float( len(l)/ (len(l)+len(r)))
    #rint(p)
    return current_info - (p * cal_gini(l)) - ((1-p) * cal_gini(r))

In [90]:
curr_i = cal_gini(X); print(curr_i)

0.9997499999999999


In [91]:
#tr, fr = partition_my_data(X.to_numpy(),question(2,0))
tr, fr = partition_my_data(X.to_numpy(),question(2,0))

In [92]:
len(tr),len(fr)

(148, 52)

In [93]:
info =info_gain(fr,tr,curr_i); print(info)

0.5480919958419957


In [94]:
cal_gini(tr)

0.4474068663257852

In [95]:
def best_split(x):
    '''
    finding the best question to ask i.e. best feature and best threshold
    calculate the IV
    '''
    best_gain = 0 #to track the IV values
    best_question = None #best question based on feature and threshold
    currrent_uncertainity = cal_gini(x)
    n_feature = len(x[0]) - 1 # -1 for target feature
    
    for col in range(n_feature): # for each feature
        u_val = set([xi[col] for xi in x]) # unique values of features
        
        for u_v in u_val:
            my_question = question(col,u_v) #create condition
            
            true_rows, false_rows = partition_my_data(x,my_question) #split data based on question
            
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            
            my_gain = info_gain(true_rows,false_rows,currrent_uncertainity) # find IV
            
            if my_gain > best_gain:
                best_gain, best_question = my_gain, my_question
                
    return best_gain,best_question    

In [96]:
best_split(X.to_numpy())

(0.15340430723878157, is Sex_female == 0 )

In [97]:
class leaf():
    '''
    leaf node hold the original target value distribution
    {0:12,1:50} Means there are 12 0's and 50 1's in training data
    '''
    
    def __init__(self,rows):
        self.predictions = class_counts(rows)    

In [174]:
class Decision_Node:
    """A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [181]:
def build_tree(x):
    '''
    rules of recursion: 1) believe that it works 2) start by checking base case
    3) prepare for giant case
    '''
    #print(x)
    # find best question and return it's IV
    gain, question = best_split(x)
    
    #print("1,",gain,question)
    #base case when no split possible
    if gain == 0:
        #print(x)
        #print("leave")
        #print(leaf(x).predictions)
        return leaf(x)
    #print("2. yes")
    #partition data on best question
    true_rows, false_rows = partition_my_data(x,question)
    
    #print(len(true_rows),len(false_rows))
    #recursively build tree
    true_branch = build_tree(true_rows)
    
    #recursively build tree
    false_branch = build_tree(false_rows)
    
    return Decision_Node(question,true_branch,false_branch)

In [188]:
def print_tree(node, sp = ""):
    if isinstance(node, leaf):
        print(sp,"predict",node.predictions)
        return 
    
    #question at node
    print(sp,str(node.question))
    
    #print true branch recursively
    print(sp,"--> True")
    print_tree(node.true_branch,sp+" ")
    
    #print true branch recursively
    print(sp,"--> False")
    print_tree(node.false_branch,sp+" ")
    

In [189]:
my_tree = build_tree(X.to_numpy())

In [190]:
print_tree(my_tree)

 is Sex_female == 0 
 --> True
  is Pclass == 2 
  --> True
   is Parents/Children Aboard == 0 
   --> True
    is Siblings/Spouses Aboard == 0 
    --> True
     predict {1: 2, 0: 12}
    --> False
     predict {0: 4}
   --> False
    is Siblings/Spouses Aboard == 2 
    --> True
     predict {1: 1}
    --> False
     predict {1: 2, 0: 2}
  --> False
   is Siblings/Spouses Aboard == 0 
   --> True
    is Parents/Children Aboard == 2 
    --> True
     predict {0: 1, 1: 1}
    --> False
     is Pclass == 3 
     --> True
      is Parents/Children Aboard == 0 
      --> True
       predict {0: 47, 1: 6}
      --> False
       predict {0: 2}
     --> False
      is Parents/Children Aboard == 0 
      --> True
       predict {0: 11, 1: 3}
      --> False
       predict {0: 5, 1: 1}
   --> False
    is Siblings/Spouses Aboard == 1 
    --> True
     is Parents/Children Aboard == 1 
     --> True
      predict {1: 1, 0: 2}
     --> False
      is Pclass == 1 
      --> True
       predict {