# **Decision Tree**

# Decision Trees are one of the most powerful [supervised algorithms](https://en.wikipedia.org/wiki/Supervised_learning) that we have ever discovered.
# The intuition behind a decision tree is to divide the dataset into smaller datasets based on some feature until we reach subsets that can be uniquely classified.
# As the name suggests, in this algorithm we divide the dataset based on a series of decisions, which creates a tree structure.
![](http://i.ibb.co/ZTZcjbm/decision.png)
# The decision to split at each node is made based on a metric called purity.
# With two classes, a node is 100% impure when its data is split evenly 50/50 between the classes, and 100% pure when all of its data belongs to a single class.
# In order to optimize our model we choose the splits that maximize purity (i.e. minimize impurity).

# Let's learn about some of the purity metric :

# 1. gini : $G = \sum_{i=0}^{n-1} p_{i} * (1 - p_{i})$ 
      
# 2. Entropy (used to compute information gain) : $E = - \sum_{i=0}^{n-1} p_{i} * \log(p_{i})$
# where $p_{i}$ is the probability of the ith class and $n$ is the total number of classes

# Let's implement decision Tree based on gini :

In [None]:
# import
import math
import numpy as np
import pandas as pd # for manipulating dataset
from sklearn import tree # for decision tree and to plot tree
import matplotlib.pyplot as plt

In [None]:
# implement gini
def gini(samples):
    """Return the Gini impurity of a node.

    Parameters
    ----------
    samples : iterable of numbers
        Number of data points in each class at the node, e.g. ``[549, 342]``.

    Returns
    -------
    number
        ``sum(p_i * (1 - p_i))`` over all classes, where ``p_i`` is the
        fraction of the node's data points that belong to class ``i``.
        ``0`` is returned for a node that holds no data points.
    """
    # materialize the input so a one-shot iterable (e.g. a generator) can be
    # read twice: once for the total and once for the per-class fractions
    counts = list(samples)
    total = sum(counts)
    if total == 0:
        # no data points at all (no counts, or only zero counts): nothing is
        # mixed, so report a pure node instead of dividing by zero
        return 0
    return sum((count / total) * (1 - count / total) for count in counts)

In [None]:
# implement entropy (info. gain) function
def entropy(samples):
    """Return the entropy (natural logarithm) of a node.

    Parameters
    ----------
    samples : iterable of numbers
        Number of data points in each class at the node, e.g. ``[549, 342]``.

    Returns
    -------
    number
        ``-sum(p_i * log(p_i))`` over all classes with ``p_i > 0``, where
        ``p_i`` is the fraction of the node's data points in class ``i``.
        ``0`` is returned for a node that holds no data points.
    """
    # materialize the input so a one-shot iterable (e.g. a generator) can be
    # read twice: once for the total and once for the per-class fractions
    counts = list(samples)
    total = sum(counts)
    if total == 0:
        # no data points at all: report zero entropy instead of dividing by zero
        return 0

    score = 0
    for count in counts:
        prob = count / total
        # classes without data points are skipped: log(0) is undefined and
        # the term p * log(p) tends to 0 as p -> 0
        if prob > 0:
            # subtracting each term (rather than negating the final sum)
            # avoids returning -0.0 for a perfectly pure node
            score -= prob * math.log(prob)
    return score

In [None]:
# a split is scored by the size-weighted average impurity of its two branches
def weighted(p1,p2,n1,n2):
    """Return the average of ``p1`` and ``p2`` weighted by ``n1`` and ``n2``.

    p1, p2 : scores (e.g. gini values) of the first and second branch
    n1, n2 : number of samples in the first and second branch
    """
    total_samples = n1 + n2
    weighted_sum = p1 * n1 + p2 * n2
    return weighted_sum / total_samples

In [None]:
# we're going to use the Titanic dataset
# (the training CSV is read from a Kaggle-style input path)
df = pd.read_csv('/kaggle/input/titanic/train.csv')
# preview the first rows of the loaded frame
df.head()

In [None]:
# we'll only use the Sex, Pclass and Survived (target) columns
col = ['Survived' , 'Pclass' , 'Sex']
# take an explicit copy of the selection so that later column assignments
# (such as re-encoding Sex) write to an independent frame and do not trigger
# pandas' SettingWithCopyWarning
df = df[col].copy()
df.head()

In [None]:
# The Sex column is categorical ('male' / 'female'), so encode it numerically:
# male -> 1 and female -> 0
mapping = dict(male=1, female=0)
df['Sex'] = df['Sex'].map(mapping)
df.head()

In [None]:
# class balance of the target: in total 549 rows have label 0 and 342 rows have label 1
df.Survived.value_counts()

In [None]:
# gini of the whole data (the starting gini, before any split)
# computed from the actual class counts of the target column
# (549 rows with label 0 and 342 rows with label 1 in the Titanic training set)
gini(df.Survived.value_counts().tolist())

In [None]:
# Pclass contains only 3 distinct values: 1, 2 and 3
df.Pclass.unique()

In [None]:
# so what kinds of split conditions can we make?

# let's say we have pclass with : 1 , 2 , 3
# so we can make split at :         |   |
#                               (1.5)  (2.5)

# and sex has 2 values : 0 , 1 so we can make a split at 0.5


![](http://i.ibb.co/hdK8LCR/pclass.png)

In [None]:
# Evaluate the split "Pclass <= 1.5".
# Class counts, branch sizes and gini values are computed from the data
# instead of being typed in by hand, so they cannot drift from the dataset
# and the weighted gini is not built from rounded inputs.

# if the condition is true
x = df[df.Pclass <= 1.5]
true_n = x.shape[0]
true_counts = x.Survived.value_counts()
true_gini = gini(true_counts.tolist())
print(f"total elements with pclass <= 1.5  : {true_n}")
print(true_counts)
print(f"Gini : {true_gini}\n")

# if the condition is false
x = df[df.Pclass > 1.5]
false_n = x.shape[0]
false_counts = x.Survived.value_counts()
false_gini = gini(false_counts.tolist())
print(f"total elements with pclass > 1.5  : {false_n}")
print(false_counts)
print(f"Gini : {false_gini}\n")

# each branch's gini is weighted by the number of rows in that branch
print(f"Weighted Gini : {weighted(true_gini , false_gini , true_n , false_n)}")

![](http://i.ibb.co/X8WSPWT/pclass2.png)

In [None]:
# Evaluate the split "Pclass <= 2.5".
# Class counts, branch sizes and gini values are computed from the data
# instead of being typed in by hand, so they cannot drift from the dataset
# and the weighted gini is not built from rounded inputs.

# if the condition is true
x = df[df.Pclass <= 2.5]
true_n = x.shape[0]
true_counts = x.Survived.value_counts()
true_gini = gini(true_counts.tolist())
print(f"total elements with pclass <= 2.5  : {true_n}")
print(true_counts)
print(f"Gini : {true_gini}\n")

# if the condition is false
x = df[df.Pclass > 2.5]
false_n = x.shape[0]
false_counts = x.Survived.value_counts()
false_gini = gini(false_counts.tolist())
print(f"total elements with pclass > 2.5  : {false_n}")
print(false_counts)
print(f"Gini : {false_gini}\n")

# each branch's gini is weighted by the number of rows in that branch
print(f"Weighted Gini : {weighted(true_gini , false_gini , true_n , false_n)}")

![](http://i.ibb.co/DRHLCHJ/sex.png)

In [None]:
# Evaluate the split "Sex <= 0.5" (female vs. male after encoding).
# Class counts, branch sizes and gini values are computed from the data
# instead of being typed in by hand, so they cannot drift from the dataset
# and the weighted gini is not built from rounded inputs.

# if the condition is true
x = df[df.Sex <= 0.5]
true_n = x.shape[0]
true_counts = x.Survived.value_counts()
true_gini = gini(true_counts.tolist())
print(f"total elements with sex <= 0.5  : {true_n}")
print(true_counts)
print(f"Gini : {true_gini}\n")

# if the condition is false
x = df[df.Sex > 0.5]
false_n = x.shape[0]
false_counts = x.Survived.value_counts()
false_gini = gini(false_counts.tolist())
print(f"total elements with sex > 0.5  : {false_n}")
print(false_counts)
print(f"Gini : {false_gini}\n")

# each branch's gini is weighted by the number of rows in that branch
print(f"Weighted Gini : {weighted(true_gini , false_gini , true_n , false_n)}")

In [None]:
# sex has the minimum weighted gini, so it becomes our starting condition

# gini : 0.383 , (81,233)
# gini : 0.306 , (468,109)

# Next level, inside the "sex <= 0.5" branch: evaluate the split "Pclass <= 1.5".
# Counts and gini values are computed from the data rather than hard-coded,
# so the weighted gini is not built from rounded inputs.

# now lets check for sex <= 0.5 and pclass <= 1.5
x = df[ (df.Sex <= 0.5) & (df.Pclass <= 1.5)]
true_n = x.shape[0]
true_counts = x.Survived.value_counts()
true_gini = gini(true_counts.tolist())
print(f"total elements with sex <= 0.5 and pclass <= 1.5 : {true_n}")
print(true_counts)
print(f"Gini : {true_gini}\n")

# now lets check for sex <= 0.5 and pclass > 1.5
x = df[ (df.Sex <= 0.5) & (df.Pclass > 1.5)]
false_n = x.shape[0]
false_counts = x.Survived.value_counts()
false_gini = gini(false_counts.tolist())
print(f"total elements with sex <= 0.5 and pclass > 1.5 : {false_n}")
print(false_counts)
print(f"Gini : {false_gini}\n")

# each branch's gini is weighted by the number of rows in that branch
print(f"weighted gini : {weighted(true_gini , false_gini , true_n , false_n)}")

In [None]:
# Inside the "sex <= 0.5" branch: evaluate the split "Pclass <= 2.5".
# Counts and gini values are computed from the data rather than hard-coded,
# so the weighted gini is not built from rounded inputs.

# now lets check for sex <= 0.5 and pclass <= 2.5
x = df[ (df.Sex <= 0.5) & (df.Pclass <= 2.5)]
true_n = x.shape[0]
true_counts = x.Survived.value_counts()
true_gini = gini(true_counts.tolist())
print(f"total elements with sex <= 0.5 and pclass <= 2.5 : {true_n}")
print(true_counts)
print(f"Gini : {true_gini}\n")

# now lets check for sex <= 0.5 and pclass > 2.5
x = df[ (df.Sex <= 0.5) & (df.Pclass > 2.5)]
false_n = x.shape[0]
false_counts = x.Survived.value_counts()
false_gini = gini(false_counts.tolist())
print(f"total elements with sex <= 0.5 and pclass > 2.5 : {false_n}")
print(false_counts)
print(f"Gini : {false_gini}\n")

# each branch's gini is weighted by the number of rows in that branch
print(f"weighted gini : {weighted(true_gini , false_gini , true_n , false_n)}")

In [None]:
# the weighted gini for pclass <= 2.5 is lower than for pclass <= 1.5, so we select pclass <= 2.5
# and this process continues until we reach the leaf nodes

# now let's build the whole tree with sklearn

features = ['Sex', 'Pclass']
# create the classifier with its default settings ...
model = tree.DecisionTreeClassifier()
# ... and train it: X holds the two feature columns, y the Survived target
model.fit(X=df[features], y=df.Survived)

# draw the fitted tree on a wide canvas so the nodes stay readable
plt.figure(figsize=(20, 10))
tree.plot_tree(decision_tree=model, feature_names=features)
plt.show()

In [None]:
# Problem with Decision Trees :

# As the number of splits increases, the complexity of the DT also increases.
# In general, simple DTs are preferred over complex DTs.
# The main problem with a DT is that if the tree becomes complex enough to classify every training data point with 100% accuracy, it is likely to overfit.
# In the image below we'll prefer the black line over the green line.


![](http://i.ibb.co/09fqDcB/over.png)

In [None]:
# To handle the problem of overfitting we can control the depth of the decision tree. This might decrease
# training accuracy, but it usually improves accuracy on unseen (test) data.
# In other words, keep splitting only until nodes are e.g. 96% or 99% pure instead of 100% pure.

# In sklearn we can pass max_depth, e.g.:

model = tree.DecisionTreeClassifier(max_depth = 7)

# The optimal max_depth can be found with cross-validation, for example via a grid search.

# This whole notebook is based on @abhishek thakur's [YouTube](https://youtu.be/1DMWkIJRivo) video.
# Reference : blog post from [Towards Data Science](https://towardsdatascience.com/the-complete-guide-to-decision-trees-28a4e3c7be14)