### Decision Trees

Write functions to calculate the entropy of a node and the information gain of a split (a root node and its child nodes).

In [48]:
import numpy as np
from sklearn.preprocessing import normalize

def entropy_function(dist):

    '''
    Calculates and returns the Shannon entropy (in bits) of a node distribution.

    Parameters
    ----------
    dist : array-like
        Class probabilities of the node (typically a 1D numpy array that sums
        to 1). Integer and list inputs are accepted. The input is never modified.

    Returns
    -------
    entropy : numpy.float64
        -sum(p * log2(p)) over every entry of `dist`. Entries that are not
        strictly positive contribute zero, following the convention
        0 * log2(0) = 0.
    '''

    # Work on a float view/copy: integer counts would otherwise break the
    # arithmetic, and the caller's array must not be overwritten.
    probs = np.asarray(dist, dtype=float)

    # Evaluate log2 only where it is defined (p > 0); every other position
    # keeps the 0.0 it was initialised with, so it adds nothing to the sum.
    logs = np.zeros_like(probs)
    np.log2(probs, out=logs, where=probs > 0)

    # Add all transformed probabilities to get the total entropy. Subtracting
    # from 0.0 (rather than negating) keeps a pure node at 0.0 instead of -0.0.
    entropy = 0.0 - np.sum(probs * logs)

    return entropy

def info_gain(root_split, child_splits):

    '''
    Calculate the information gain of a feature from the class counts of the
    root node and of the child nodes produced by splitting on that feature.

    Parameters
    ----------
    root_split : array-like, shape (n_classes,)
        Class counts of the root node, e.g. [22, 18]. Must contain at least
        one sample, otherwise the child weights are undefined (division by
        zero).
    child_splits : array-like, shape (n_children, n_classes)
        Class counts of each child node, one row per child.

    Returns
    -------
    IG : float
        Entropy of the root minus the weighted sum of the child entropies,
        where each child is weighted by its share of the root's samples.
    '''

    # Coerce to float arrays so that plain lists/tuples and integer counts are
    # accepted for both arguments (the root needs .reshape below).
    root_counts = np.asarray(root_split, dtype=float)
    child_counts = np.asarray(child_splits, dtype=float)

    # Calculate the weights of the child nodes: samples in child / samples in root
    weights = np.sum(child_counts, axis=1) / np.sum(root_counts)

    # Normalize both the root and children nodes into probability
    # distributions before calculating the entropy. The root is treated as a
    # single row so that both calls normalize along the same axis.
    normed_root = normalize(root_counts.reshape(1, -1), axis=1, norm='l1')
    normed_children = normalize(child_counts, axis=1, norm='l1')

    # Get the entropy of the root node
    H_root = entropy_function(normed_root)

    # Get the entropy of each child node (one value per row)
    H_child = np.apply_along_axis(entropy_function, 1, normed_children)

    # Information gain: root entropy minus the weighted child entropies.
    # An empty child (all-zero row) has weight 0 and so does not affect the sum.
    IG = H_root - np.sum(H_child * weights)

    return IG

# Worked example: class counts at the root node ...
root = np.array([22, 18])

# ... and the class counts of the five child nodes after the split
# (one row per child, one column per class).
children = np.array([
    [0, 0],
    [4, 17],
    [1, 0],
    [8, 0],
    [9, 1],
])

# Report the information gain obtained by this split.
print(info_gain(root, children))

0.506730616163033
