In [1]:
import numpy as np
import pandas as pd

### Decision Trees

A decision tree is a machine-learning (ML) algorithm that can be used for regression or classification problems. 

1. They come under the class of tree-based models
2. Are explainable as we can build the hierarchy of decision rules
3. Are prone to overfitting, which is why we use ensemble tree-based models - boosting and bagging

In [2]:
# Load the toy dataset and discard any rows that contain missing values.
df = pd.read_csv('dt_dataset.csv').dropna()
df

Unnamed: 0,Outlook,Temperature,Humidity,WindSpeed,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rainy,Mild,High,Weak,Yes
4,Rainy,Cool,Normal,Weak,Yes
5,Rainy,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rainy,Mild,Normal,Weak,Yes


#### Impurity Measures:
1. **Shannon Entropy:** Used in ID3 and C4.5
2. **Gini Index:** Used in CART 

#### Splitting Criterion: 
1. **Information Gain:** 
    1. Difference between the entropy of the parent node and the weighted average entropy of its child nodes. 
    2. Criterion used in ID3.
2. **Gain Ratio:** 
    1. Corrects the bias of information gain towards high-cardinality attributes; as a result it tends to favour attributes with lower cardinality.
    2. It can be seen as a normalized version of information gain, as it divides the information gain by the entropy of the attribute's own value distribution. 
    3. So if a multi-valued attribute carries a lot of information, it may also be highly heterogeneous and therefore have high entropy. In such cases, gain ratio is a better criterion. 
    4. Gain ratio is used in C4.5.

In [3]:
def get_probabilities(df, target):
    """Return the relative frequency of each distinct value in ``df[target]``.

    Args:
        df: DataFrame holding the column of interest.
        target: Name of the column whose value distribution is wanted.

    Returns:
        NumPy array of ``count / len(df)`` for every distinct value, in the
        order produced by ``value_counts()``.
    """
    value_counts = df[target].value_counts()
    n_rows = len(df)
    return (value_counts / n_rows).values

def get_entropy(probs):
    """Compute the Shannon entropy (base 2) of a probability distribution.

    Args:
        probs: Iterable of probabilities.

    Returns:
        ``-sum(p * log2(p))`` taken over the non-zero entries of ``probs``.
    """
    # Zero probabilities are skipped so that log2(0) is never evaluated.
    return -sum(p * np.log2(p) for p in probs if p != 0)
    
def get_parent_information(df, target):
    """Return the entropy of the ``target`` column over all rows of ``df``.

    Args:
        df: DataFrame representing the node being evaluated.
        target: Name of the label column.

    Returns:
        Entropy of the label distribution, as computed by ``get_entropy``.
    """
    return get_entropy(get_probabilities(df, target))

def get_feature_information(df, feature, target):
    """Return the weighted entropy of ``target`` after partitioning on ``feature``.

    Each distinct value of ``feature`` defines a subset of rows; the entropy of
    ``target`` inside that subset is weighted by the subset's share of the
    counted rows and the weighted terms are summed.

    Args:
        df: DataFrame representing the node being evaluated.
        feature: Name of the candidate splitting column.
        target: Name of the label column.

    Returns:
        The weighted sum of per-value entropies.
    """
    level_counts = df[feature].value_counts()
    levels = level_counts.index.tolist()
    sizes = level_counts.values.tolist()
    n_total = sum(sizes)

    weighted_entropy = 0
    for level, size in zip(levels, sizes):
        subset = df[df[feature] == level]
        weight = size / n_total
        weighted_entropy += weight * get_entropy(get_probabilities(subset, target))
    return weighted_entropy

In [4]:
# criteria for tree splitting
# information gain
def get_max_information_gain(parent_info, feature_infos):
    """Return the position of the feature with the largest information gain.

    Args:
        parent_info: Entropy of the node before splitting.
        feature_infos: Weighted child entropies, one per candidate feature.

    Returns:
        Index into ``feature_infos`` of the largest ``parent_info - info``;
        the first such index when several are tied.
    """
    info_gains = [parent_info - info for info in feature_infos]
    # max() keeps the earliest candidate on ties, matching list.index(max(...)).
    return max(range(len(info_gains)), key=info_gains.__getitem__)

# gain ratio
def get_max_gain_ratio(parent_info, feature_infos, feature_entropies):
    """Return the position of the feature with the largest gain ratio.

    Args:
        parent_info: Entropy of the node before splitting.
        feature_infos: Weighted child entropies, one per candidate feature.
        feature_entropies: Entropy of each candidate feature's own value
            distribution, in the same order as ``feature_infos``.

    Returns:
        Index of the largest ``(parent_info - info) / entropy``; the first such
        index when several are tied.
    """
    info_gains = np.array([parent_info - info for info in feature_infos])
    # Entropies are floored at 1e-8 so the division below never hits zero.
    split_infos = np.clip(np.array(feature_entropies), 1E-8, None)
    ratios = (info_gains / split_infos).tolist()
    return max(range(len(ratios)), key=ratios.__getitem__)

In [5]:
# getting splitting feature for data
def get_split(df, target, splitting_criterion = 'information_gain'):
    """Pick the column of ``df`` to split on.

    Args:
        df: DataFrame for the current node; every column except ``target`` is
            treated as a candidate feature.
        target: Name of the label column.
        splitting_criterion: ``'information_gain'`` selects the feature with
            the largest information gain; any other value selects the feature
            with the largest gain ratio.

    Returns:
        Name of the selected feature column.

    Raises:
        ValueError: If ``df`` has no column other than ``target``.
    """
    features = df.columns.tolist()
    features.remove(target)
    if not features:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise ValueError('no candidate features left to split on')

    # The parent entropy does not depend on the candidate, so compute it once.
    parent_info = get_parent_information(df, target)
    feature_infos = [get_feature_information(df, feature, target) for feature in features]

    # Rank the candidates a single time, after all of them have been scored.
    if splitting_criterion == 'information_gain':
        feature_idx = get_max_information_gain(parent_info, feature_infos)
    else:
        # Entropy of each feature's own distribution is only needed for gain ratio.
        feature_entropies = [get_entropy(get_probabilities(df, feature)) for feature in features]
        feature_idx = get_max_gain_ratio(parent_info, feature_infos, feature_entropies)

    return features[feature_idx]

In [6]:
class TreeNode:
    """Node of an n-ary tree that can print itself as an indented outline.

    Attributes set by the constructor:
        val: Payload of the node (``data``).
        path: Label of the edge leading to this node (``path_data``).
        parent: Parent node; ``None`` until assigned by the caller.
        children: List of child nodes, initially empty.
    """

    def __init__(self, data, path_data):
        self.val = data
        self.path = path_data
        self.parent = None
        self.children = []

    def get_level(self):
        """Return how many ancestors this node has (0 for a node without parent)."""
        depth = 0
        ancestor = self.parent
        while ancestor:
            depth += 1
            ancestor = ancestor.parent
        return depth

    def print_tree(self):
        """Print this node and, recursively, all of its descendants.

        Each level is indented by seven spaces; nodes that have a parent are
        printed as ``path: val`` and a parentless node as ``val`` only.
        """
        indent = '       ' * self.get_level()
        label = f'{self.path}: {self.val}' if self.parent else f'{self.val}'
        print(f'{indent} {label}')

        for child in self.children:
            child.print_tree()

In [7]:
def decision_tree_ID3(df, target, root = None, entity = None):
    """Recursively build a decision tree using information gain (ID3).

    Args:
        df: DataFrame for the current node; every column except ``target`` is
            a candidate feature.
        target: Name of the label column.
        root: Unused; kept so existing callers keep working.
        entity: Value of the parent's splitting feature that leads to this
            node (``None`` for the top-level call). Stored as the node's path.

    Returns:
        A ``TreeNode`` whose ``val`` is a label for leaves, or the name of the
        splitting feature for internal nodes.
    """
    labels = df[target]

    # Pure node: every row carries the same label.
    if labels.nunique() == 1:
        return TreeNode(labels.unique()[0], entity)

    # All features consumed but labels still disagree: fall back to the most
    # frequent label instead of asking for a split with no candidates.
    if df.shape[1] <= 1:
        return TreeNode(labels.value_counts().idxmax(), entity)

    splitting_feature = get_split(df, target, splitting_criterion = 'information_gain')
    node = TreeNode(splitting_feature, entity)

    # One child per distinct value of the chosen feature.
    for value in df[splitting_feature].unique().tolist():
        df_child = (
            df.loc[df[splitting_feature] == value, :]
            .drop(columns=splitting_feature)
            .reset_index(drop = True)
        )

        # recursively build child trees
        child = decision_tree_ID3(df_child, target, root = None, entity = value)
        child.parent = node
        node.children.append(child)
    return node
            
# Build the ID3 tree for the 'Play' label and print it as an indented outline.
target = 'Play'
tree = decision_tree_ID3(df, target)
tree.print_tree()

 Outlook
        Sunny: Humidity
               High: No
               Normal: Yes
        Overcast: Yes
        Rainy: WindSpeed
               Weak: Yes
               Strong: No


In [8]:
def decision_tree_C4_5(df, target, root = None, entity = None):
    """Recursively build a decision tree using gain ratio (C4.5-style split).

    Args:
        df: DataFrame for the current node; every column except ``target`` is
            a candidate feature.
        target: Name of the label column.
        root: Unused; kept so existing callers keep working.
        entity: Value of the parent's splitting feature that leads to this
            node (``None`` for the top-level call). Stored as the node's path.

    Returns:
        A ``TreeNode`` whose ``val`` is a label for leaves, or the name of the
        splitting feature for internal nodes.
    """
    labels = df[target]

    # Pure node: every row carries the same label.
    if labels.nunique() == 1:
        return TreeNode(labels.unique()[0], entity)

    # All features consumed but labels still disagree: fall back to the most
    # frequent label instead of asking for a split with no candidates.
    if df.shape[1] <= 1:
        return TreeNode(labels.value_counts().idxmax(), entity)

    splitting_feature = get_split(df, target, splitting_criterion = 'gain_ratio')
    node = TreeNode(splitting_feature, entity)

    # One child per distinct value of the chosen feature.
    for value in df[splitting_feature].unique().tolist():
        df_child = (
            df.loc[df[splitting_feature] == value, :]
            .drop(columns=splitting_feature)
            .reset_index(drop = True)
        )

        # recursively build child trees
        child = decision_tree_C4_5(df_child, target, root = None, entity = value)
        child.parent = node
        node.children.append(child)
    return node
            
# Build the gain-ratio (C4.5) tree for the 'Play' label and print it.
target = 'Play'
tree = decision_tree_C4_5(df, target)
tree.print_tree()

 Outlook
        Sunny: Humidity
               High: No
               Normal: Yes
        Overcast: Yes
        Rainy: WindSpeed
               Weak: Yes
               Strong: No


In [9]:
def get_gini_impurity(probs):
    """Compute the Gini impurity of a probability distribution.

    Args:
        probs: Iterable of probabilities.

    Returns:
        ``1 - sum(p ** 2)`` over all entries of ``probs``.
    """
    return 1 - sum(p ** 2 for p in probs)

def get_feature_gini_index(df, feature, target):
    """Return the weighted Gini impurity of ``target`` after partitioning on ``feature``.

    Each distinct value of ``feature`` defines a subset of rows; the Gini
    impurity of ``target`` inside that subset is weighted by the subset's share
    of the counted rows and the weighted terms are summed.

    Args:
        df: DataFrame representing the node being evaluated.
        feature: Name of the candidate splitting column.
        target: Name of the label column.

    Returns:
        The weighted sum of per-value Gini impurities.
    """
    level_counts = df[feature].value_counts()
    levels = level_counts.index.tolist()
    sizes = level_counts.values.tolist()
    n_total = sum(sizes)

    weighted_gini = 0
    for level, size in zip(levels, sizes):
        subset = df[df[feature] == level]
        weight = size / n_total
        weighted_gini += weight * get_gini_impurity(get_probabilities(subset, target))
    return weighted_gini

# getting splitting feature for data
def get_split_CART(df, target):
    """Pick the column of ``df`` whose split yields the lowest weighted Gini index.

    Args:
        df: DataFrame for the current node; every column except ``target`` is
            treated as a candidate feature.
        target: Name of the label column.

    Returns:
        Name of the selected feature column; the earliest column wins ties.
    """
    candidates = df.columns.tolist()
    candidates.remove(target)

    # min() keeps the first candidate among equal scores, as list.index(min(...)) does.
    return min(candidates, key=lambda name: get_feature_gini_index(df, name, target))

In [10]:
def decision_tree_CART(df, target, root = None, entity = None):
    """Recursively build a decision tree by minimizing the weighted Gini index.

    Args:
        df: DataFrame for the current node; every column except ``target`` is
            a candidate feature.
        target: Name of the label column.
        root: Unused; kept so existing callers keep working.
        entity: Value of the parent's splitting feature that leads to this
            node (``None`` for the top-level call). Stored as the node's path.

    Returns:
        A ``TreeNode`` whose ``val`` is a label for leaves, or the name of the
        splitting feature for internal nodes.
    """
    labels = df[target]

    # Pure node: every row carries the same label.
    if labels.nunique() == 1:
        return TreeNode(labels.unique()[0], entity)

    # All features consumed but labels still disagree: fall back to the most
    # frequent label instead of asking for a split with no candidates.
    if df.shape[1] <= 1:
        return TreeNode(labels.value_counts().idxmax(), entity)

    splitting_feature = get_split_CART(df, target)
    node = TreeNode(splitting_feature, entity)

    # One child per distinct value of the chosen feature.
    for value in df[splitting_feature].unique().tolist():
        df_child = (
            df.loc[df[splitting_feature] == value, :]
            .drop(columns=splitting_feature)
            .reset_index(drop = True)
        )

        # recursively build child trees
        child = decision_tree_CART(df_child, target, root = None, entity = value)
        child.parent = node
        node.children.append(child)
    return node
            
# Build the Gini-based (CART) tree for the 'Play' label and print it.
target = 'Play'
tree = decision_tree_CART(df, target)
tree.print_tree()

 Outlook
        Sunny: Humidity
               High: No
               Normal: Yes
        Overcast: Yes
        Rainy: WindSpeed
               Weak: Yes
               Strong: No


### Summary

Different implementations of Decision Trees based on splitting and impurity criterion:
1. ID3:
    1. **Impurity criterion:** Shannon/Information Entropy
    2. **Splitting criterion:** Information Gain
2. C4.5:
    1. **Impurity criterion:** Shannon/Information Entropy
    2. **Splitting criterion:** Gain Ratio
3. CART:
    1. **Impurity criterion:** Gini Index
    2. **Splitting criterion:** Minimizing Gini Index