In [40]:
# Decision trees are supervised learning models.
# They can be used for both classification and regression tasks.
# They have a simple tree-like structure (hierarchical in nature).
# A decision tree can be thought of as a set of nested if-else conditions.
# They are highly interpretable models whose workings are easy to explain.

In [41]:
# Why are Decision Trees popular?
# They are easy to interpret and represent.
# They mimic human-level thinking: they try to make decisions the way a human does.
# Ensemble models are made up of Decision Trees and perform even better than individual Decision Trees.
# When features are categorical, Decision Trees are preferred over other models.

In [42]:
# Entropy - Entropy is a measure of the randomness (impurity) of a system.

In [2]:
import numpy as np

In [12]:
def entropy(var):
    """Return the Shannon entropy, in bits, of the labels in ``var``.

    Parameters
    ----------
    var : array-like
        Collection of labels (NumPy array, pandas Series, list, ...).
        The number of samples is taken from the first dimension.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct label. The result is ``0.0`` for an empty input or for an
        input that contains a single distinct label.
    """
    labels = np.asarray(var)
    N = labels.shape[0]

    # No samples means no uncertainty; also avoids dividing by zero below.
    if N == 0:
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probs = counts / N

    # Every p is strictly positive here (np.unique only reports labels that
    # occur), so log2 is always finite.
    ent = -np.sum(probs * np.log2(probs))

    # For a pure set the sum is +0.0 and negating it yields IEEE -0.0;
    # adding 0.0 normalises that to a plain 0.0.
    return float(ent) + 0.0

In [8]:
# Balanced binary labels (four 1s, four 0s) used to exercise entropy().
Y = np.array([1,0,0,1,0,1,0,1])

In [9]:
# A pure set: every label is identical, used to exercise entropy().
X = np.array([1,1,1,1,1,1,1])

In [13]:
entropy(Y)

1.0

In [14]:
entropy(X)

-0.0

In [33]:
# Data split - divide the data into multiple child nodes, one per feature value.

In [22]:
import pandas as pd

In [24]:
# Load the dataset from 'golf.csv' (path is relative to the working directory).
df = pd.read_csv('golf.csv')

In [25]:
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [31]:
def divide_data(data, feature):
    """Partition ``data`` by the distinct values of the column ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. It is not modified.
    feature : str
        Name of the column whose values define the partitions.

    Returns
    -------
    dict
        Maps every value of ``feature`` that occurs in ``data`` to
        ``{'data': <rows of data having that value>, 'len': <row count>}``.
        Keys follow the order of ``data[feature].value_counts()``. The row
        subsets keep their original index labels. Rows whose feature value
        is missing are not counted by ``value_counts`` and therefore end up
        in no partition.
    """
    # Count on the frame that was passed in, not on a notebook-level global:
    # when this is called on a subset, the partitions and their sizes must
    # describe that subset only.
    occurrences = data[feature].value_counts()

    DATA = {}
    for val, count in occurrences.items():
        # One boolean-mask selection per value instead of growing a frame
        # row by row with pd.concat (which is quadratic in the row count).
        DATA[val] = {'data': data[data[feature] == val], 'len': int(count)}

    return DATA

In [32]:
divide_data(df, 'Outlook')

{'sunny': {'data':    Outlook Temperature Humidity  Windy Play
  0    sunny         hot     high  False   no
  1    sunny         hot     high   True   no
  7    sunny        mild     high  False   no
  8    sunny        cool   normal  False  yes
  10   sunny        mild   normal   True  yes,
  'len': 5},
 'rainy': {'data':    Outlook Temperature Humidity  Windy Play
  3    rainy        mild     high  False  yes
  4    rainy        cool   normal  False  yes
  5    rainy        cool   normal   True   no
  9    rainy        mild   normal  False  yes
  13   rainy        mild     high   True   no,
  'len': 5},
 'overcast': {'data':      Outlook Temperature Humidity  Windy Play
  2   overcast         hot     high  False  yes
  6   overcast        cool   normal   True  yes
  11  overcast        mild     high   True  yes
  12  overcast         hot   normal  False  yes,
  'len': 4}}

In [34]:
# Information gain

In [35]:
def information_gain(data, feature, target='Play'):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column of ``data`` minus the
    weighted sum of the entropies of the ``target`` column in each partition
    returned by ``divide_data(data, feature)``. Each partition is weighted by
    its ``'len'`` entry divided by the number of rows in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature column and the label column.
    feature : str
        Column to split on.
    target : str, optional
        Name of the label column. Defaults to ``'Play'``.

    Returns
    -------
    float
        The information gain; ``0.0`` when ``data`` has no rows.
    """
    examples = data.shape[0]

    # Nothing to split: avoids dividing by zero in the weights below.
    if examples == 0:
        return 0.0

    partitions = divide_data(data, feature)

    # Weighted average entropy of the children produced by the split.
    ent_of_children = sum(
        (part['len'] / examples) * entropy(part['data'][target])
        for part in partitions.values()
    )

    return entropy(data[target]) - ent_of_children

In [36]:
information_gain(df, 'Outlook')

0.24674981977443933

In [37]:
information_gain(df, 'Windy')

0.04812703040826949

In [38]:
information_gain(df, 'Temperature')

0.02922256565895487

In [39]:
information_gain(df, 'Humidity')

0.15183550136234159

In [43]:
# Constructing a Decision Tree

In [45]:
# Stopping Condition

# 1. The node is pure (all of its points share one label).
# 2. The tree can't grow any further because there are no more points to split.
# 3. The maximum depth has already been reached.

In [50]:
class DecisionTree:
    """A node of a categorical decision tree built by information gain.

    Each node stores the feature it splits on (``fkey``), one child node per
    value of that feature (``children``), and the most frequent ``'Play'``
    label among the rows it was trained on (``target``). A node without
    children acts as a leaf and predicts ``target``.
    """

    # Feature columns used when train() is called without an explicit list.
    DEFAULT_FEATURES = ('Outlook', 'Temperature', 'Humidity', 'Windy')

    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int, optional
            Depth of this node in the tree (the root is 0).
        max_depth : int, optional
            A node whose depth has reached this value is not split further.
        """
        self.children = {}           # feature value -> child DecisionTree
        self.fkey = None             # name of the feature this node splits on
        self.max_depth = max_depth
        self.depth = depth
        self.target = None           # majority label of the node's rows

    def train(self, data, features=None):
        """Fit this node on ``data`` and recursively grow its children.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows. Must contain the feature columns and a ``'Play'``
            label column.
        features : sequence of str, optional
            Candidate columns to split on. Defaults to ``DEFAULT_FEATURES``.
            The same list is handed down to every child node.
        """
        features = list(self.DEFAULT_FEATURES if features is None else features)

        # Pick the feature with the highest information gain (first one wins ties).
        info_gains = [information_gain(data, f) for f in features]
        self.fkey = features[np.argmax(info_gains)]

        # Split the data on the chosen feature.
        DATA = divide_data(data, self.fkey)

        # Label the node with the most frequent class among its rows.
        self.target = data['Play'].value_counts().idxmax()

        ###### STOPPING CONDITIONS ######

        # Only partitions that really hold rows can become children.
        non_empty = {
            key: part['data']
            for key, part in DATA.items()
            if part['data'].shape[0] > 0
        }

        # 1. Fewer than two non-empty partitions: the split does not separate
        #    anything, so the node stays a leaf.
        if len(non_empty) < 2:
            return

        # 2. Early stop once the maximum depth has been reached.
        if self.depth >= self.max_depth:
            return

        print("\t"*self.depth + "Making tree with - ", self.fkey)

        # Recursively train one child per non-empty partition. max_depth is
        # passed down explicitly; otherwise every child would fall back to
        # the constructor default and ignore the limit set on the root.
        for key, new_data in non_empty.items():
            self.children[key] = DecisionTree(depth=self.depth + 1,
                                              max_depth=self.max_depth)
            self.children[key].train(new_data, features)

        return

    def predict(self, test):
        """Return the predicted label for a single example.

        Parameters
        ----------
        test : pandas.DataFrame, pandas.Series or mapping
            The example to classify. For a DataFrame (or anything whose
            columns are sequences) the first row is used. A Series or
            mapping that holds scalar values per feature is used directly.

        Returns
        -------
        The ``target`` of the leaf that the example reaches. If the example
        has a value for the split feature that no child was trained on, the
        current node's ``target`` is returned instead.
        """
        if not self.children:
            return self.target

        value = test[self.fkey]
        if hasattr(value, 'iloc'):
            # pandas column: take the first row by position, so the frame's
            # index labels do not matter.
            value = value.iloc[0]
        elif isinstance(value, (list, tuple, np.ndarray)):
            value = value[0]

        child = self.children.get(value)
        if child is None:
            # Value not seen during training: fall back to this node's
            # majority label rather than raising KeyError.
            return self.target
        return child.predict(test)

In [51]:
# Explore the Model

In [52]:
model = DecisionTree()

In [54]:
model.train(df)

Making tree with -  Outlook
	Making tree with -  Humidity
	Making tree with -  Windy


In [55]:
model

<__main__.DecisionTree at 0x1b9db803c50>

In [57]:
model.target

'yes'

In [58]:
model.fkey

'Outlook'

In [59]:
model.children

{'sunny': <__main__.DecisionTree at 0x1b9da345100>,
 'rainy': <__main__.DecisionTree at 0x1b9d75ff080>,
 'overcast': <__main__.DecisionTree at 0x1b9da346b40>}

In [60]:
model.children['sunny']

<__main__.DecisionTree at 0x1b9da345100>

In [61]:
model.children['sunny'].fkey

'Humidity'

In [62]:
model.children['sunny'].children

{'high': <__main__.DecisionTree at 0x1b9db5816d0>,
 'normal': <__main__.DecisionTree at 0x1b9db6aba70>}

In [63]:
model.children['sunny'].children['high'].children

{}

In [64]:
model.children['overcast'].target

'yes'

In [65]:
model.children['overcast'].children

{}

In [66]:
# Prediction Time

In [67]:
# Single-row test frame; its columns are all columns of df except the last one.
x_test = pd.DataFrame([['sunny', 'hot', 'normal', False]], columns=list(df.columns.values[:-1]))

In [68]:
x_test

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,sunny,hot,normal,False


In [69]:
model.predict(x_test)

'yes'