In [11]:
import pandas as pd
import numpy as np

# Decision Tree

Our dataset contains 10 people that were passengers on the Mini-Titanic. We know their age and their sex and we want to predict with a Decision Tree if they survived or not.

In [2]:
age = [
    12,
    10,
    11,
    9,
    8,
    45,
    63,
    72,
    55,
    66
]

In [3]:
# 1 -> Male
# 2 -> Female

sex = [
    1,
    1,
    1,
    0,
    0,
    1,
    1,
    1,
    0,
    0
]

In [4]:
# 1 -> Yes
# 0 -> No

survived = [
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    1,
    1
]

In [9]:
data = pd.DataFrame({'age':age,'sex':sex,'survived':survived})

In [10]:
data

Unnamed: 0,age,sex,survived
0,12,1,0
1,10,1,0
2,11,1,0
3,9,0,0
4,8,0,0
5,45,1,0
6,63,1,0
7,72,1,0
8,55,0,1
9,66,0,1


# CART Algorithm

Classification and Regression Trees by Breiman et al (1984). Its a binary dicision tree that is constructed by splitting a node into two child nodes repeatedly, beginning with the root node that contains the whole learning sample.

## Tree Growing Process

Choose a split among all the possible splits at each node so that the resulting child nodes are the purest.

In [12]:
def mse(y, y_hat): 1/len(y)*np.sum((y-y_hat)**2)

In [148]:
X = np.array([data['age'], data['sex']])
y = np.array(data['survived'])

In [149]:
X

array([[12, 10, 11,  9,  8, 45, 63, 72, 55, 66],
       [ 1,  1,  1,  0,  0,  1,  1,  1,  0,  0]])

In [150]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [86]:
def loss(X,y):
    l = 0
    for i in range(len(y)):
        y_hat = dt(X[0][i], X[1][i])
        l += (y_hat-y[i])**2
    return 1/len(y)*l

In [93]:
def dt(age, sex):
    if sex == 0:
        if age > 45: return 1
        else       : return 0
    else: return 0

In [94]:
loss(X,y)

0.0

In [183]:
class Node:
    def __init__(self):
        self.tt,self.tn,self.nt,self.nn=0,0,0,0
    
    def pred(self,x):
        if x == 1: return 1
        else     : return 0
    
    def calc(self,X,y):
        for i,x in enumerate(X):
            p = self.pred(x)
            if y[i] == 1 and p == 1: self.tt += 1
            if y[i] == 1 and p == 0: self.tn += 1
            if y[i] == 0 and p == 1: self.nt += 1
            if y[i] == 0 and p == 0: self.nn += 1

In [178]:
def gini_impurity(node):
    n_left  = node.tt + node.tn
    n_right = node.nt + node.nn
    gini_left  = 1 - (node.tt/n_left)**2  - (node.tn/n_left)**2
    gini_right = 1 - (node.nt/n_right)**2 - (node.nn/n_right)**2
    n       = n_left  + n_right
    return (n_left/n)*gini_left + (n_right/n)*gini_right

In [166]:
node = Node()

In [167]:
node.calc(data['age'], data['survived'])

In [172]:
age_impurity = gini_impurity(node)

0.30000000000000004

In [179]:
node = Node()

In [184]:
node.calc(data['sex'], data['survived'])

In [185]:
sex_impurtiy = gini_impurity(node)

In [186]:
sex_impurtiy

0.30000000000000004