### Imports needed

In [44]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.tree import export_graphviz
import pydotplus

from sklearn.metrics import confusion_matrix

import numpy as np

## DecisionTree for OR operation

In [45]:
# Names of the two boolean input columns and the possible output classes.
features = ['X1', 'X2']
classes = np.array([True, False])

# Truth table of the two inputs, one row per sample.
X = np.array([
    [True, False],
    [False, True],
    [False, False],
    [True, True],
])
# Target is the logical OR of the two input columns.
Y = X[:, 0] | X[:, 1]

print(X)
print(Y)

[[ True False]
 [False  True]
 [False False]
 [ True  True]]
[ True  True False  True]


In [46]:
def entropy_node(Y):
    """Return the entropy, in bits, of the class labels in ``Y``.

    ``Y`` is any iterable of hashable labels that supports ``len``.
    The result is ``-sum(p * log2(p))`` over the observed classes, where ``p``
    is the fraction of samples carrying that class. An empty ``Y`` yields 0.
    """
    n_samples = len(Y)

    # Tally how many times each distinct label occurs.
    counts = {}
    for label in Y:
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1

    # Accumulate each class's contribution to the entropy.
    bits = 0
    for count in counts.values():
        p = count / n_samples
        if p == 0:
            continue
        bits += -1 * p * np.log2(p)
    return bits

In [47]:
# Calculate the entropy (info required) after splitting on the feature with index f_i

def info_fi(X, Y, f_i):
    """Return the weighted entropy of ``Y`` after splitting on column ``f_i`` of ``X``.

    The samples are grouped by the distinct values found in ``X[:, f_i]``; each
    group's entropy (from ``entropy_node``) is weighted by the fraction of
    samples that fall into it, and the weighted entropies are summed.
    """
    column = X[:, f_i]  # values of the feature we are splitting on
    n_total = len(Y)

    weighted_entropy = 0
    for value in set(column):
        labels_for_value = Y[np.where(column == value)]
        weight = len(labels_for_value) / n_total
        weighted_entropy += weight * entropy_node(labels_for_value)

    return weighted_entropy

In [48]:
# Calculate the split info of the feature with index f_i

def find_split_info(X, Y, f_i):
    """Return the split information of column ``f_i`` of ``X``.

    For every distinct value in ``X[:, f_i]`` the fraction ``p`` of samples
    having that value is computed, and ``-p * log2(p)`` is summed over all
    values. ``Y`` is used to count the samples in each group and in total.
    """
    column = X[:, f_i]  # values of the feature we are splitting on
    n_samples = len(Y)

    total = 0
    for value in set(column):
        group_size = len(Y[np.where(column == value)])
        fraction = group_size / n_samples
        total += -1 * fraction * np.log2(fraction)

    return total

In [49]:
def DecisionTree(X, Y, features, level=0, feature_indices=None):
    """Recursively grow a decision tree by gain ratio and print every node.

    Nothing is returned; the tree is only reported on stdout. For each node the
    level, the per-class counts and the entropy are printed, followed either by
    "Reached Leaf Node" or by the chosen split feature and its gain ratio.

    Parameters
    ----------
    X : 2-D numpy array, one row per sample and one column per feature.
    Y : 1-D numpy array of class labels aligned with the rows of X.
    features : list of feature names still available for splitting.
    level : depth of the current node (0 for the root).
    feature_indices : optional dict mapping feature name -> column index in X.
        When omitted it is built from ``features``; this assumes the top-level
        call passes the names in the same order as the columns of X.
    """
    if feature_indices is None:
        # Built once at the root and passed down, so the function does not
        # depend on a module-level variable defined in some other cell.
        feature_indices = {name: column for column, name in enumerate(features)}

    # Print some info as required
    print("Level",level)

    # Count how often each class occurs in this node (insertion order kept).
    class_freq = {}
    for label in Y:
        class_freq[label] = class_freq.get(label, 0) + 1

    for label, count in class_freq.items():
        print("Count of",label,"=",count)

    # Base case: the node is pure (or empty), nothing left to separate.
    if len(class_freq) <= 1:
        print("Current Entropy is = 0.0")
        print("Reached Leaf Node")
        return

    entropy_current = entropy_node(Y)
    print("Current Entropy is =",entropy_current)

    # Base case: no feature left to split on.
    if len(features) == 0:
        print("Reached Leaf Node")
        return

    # Pick the feature with the highest gain ratio. The first feature that can
    # actually partition the node is the fallback when no ratio exceeds 0.
    max_gain_ratio = 0
    max_gain_ratio_feature = None
    for f in features:
        column_index = feature_indices[f]
        split_info_f = find_split_info(X, Y, column_index)
        if split_info_f <= 0:
            # The feature has a single value in this node: it cannot split the
            # samples, and gain / split_info would be 0/0 (nan + RuntimeWarning).
            continue
        if max_gain_ratio_feature is None:
            max_gain_ratio_feature = f

        info_gain_f = entropy_current - info_fi(X, Y, column_index)
        gain_ratio_f = info_gain_f / split_info_f
        if gain_ratio_f > max_gain_ratio:
            max_gain_ratio = gain_ratio_f
            max_gain_ratio_feature = f

    # Every remaining feature is constant here, so no split is possible.
    if max_gain_ratio_feature is None:
        print("Reached Leaf Node")
        return

    print("Splitting on feature",max_gain_ratio_feature, "with gain ratio", max_gain_ratio)

    # The chosen feature is not offered to the children again.
    new_features = [f for f in features if f != max_gain_ratio_feature]

    # Column of X holding the chosen feature; one child per distinct value.
    split_column = X[:, feature_indices[max_gain_ratio_feature]]
    for value in set(split_column):
        mask = split_column == value

        # Recursive calls
        print()
        DecisionTree(X[mask], Y[mask], new_features, level+1, feature_indices)
        

In [51]:
# Map each feature name to its column index in X once, up front. Recursive
# calls can then keep indexing the original columns of X instead of building a
# new, narrower X ndarray at every level, which would increase complexity.
feature_indices = {name: column for column, name in enumerate(features)}

# Call Decision Tree
DecisionTree(X,Y,features)

Level 0
Count of True = 3
Count of False = 1
Current Entropy is = 0.8112781244591328
Splitting on feature X1 with gain ratio 0.31127812445913283

Level 1
Count of True = 1
Count of False = 1
Current Entropy is = 1.0
Splitting on feature X2 with gain ratio 1.0

Level 2
Count of False = 1
Current Entropy is = 0.0
Reached Leaf Node

Level 2
Count of True = 1
Current Entropy is = 0.0
Reached Leaf Node

Level 1
Count of True = 2
Current Entropy is = 0.0
Reached Leaf Node
