In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def chooseBestFeature(df):
    """
    Select the feature with the highest information gain.

    Every column except the last is a candidate feature; the last column is
    taken to be the class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows with the class label in the last column.

    Returns
    -------
    tuple
        ``(feature_name, information_gain)`` for the best feature. When
        several features share the highest gain, the left-most column wins.

    Raises
    ------
    ValueError
        If ``df`` has no feature columns to choose from.
    """
    features = df.columns[:-1]
    if len(features) == 0:
        # max() over an empty mapping would raise the same exception type,
        # but with a message that says nothing about the real problem.
        raise ValueError(
            "chooseBestFeature needs at least one feature column besides the label"
        )

    gains = {feature: getInfoGainOverFeature(df, feature) for feature in features}

    # Look the winner up once instead of running max() twice.
    best = max(gains, key=gains.get)
    return best, gains[best]

In [3]:
def getInfoGainOverFeature(df,feature):
    """
    Information gain obtained by partitioning ``df`` on ``feature``.

    The result is the entropy of ``df`` minus the size-weighted sum of the
    entropies of the sub-frames, one sub-frame per distinct value found in
    the ``feature`` column.
    """
    n_rows = len(df)

    # One sub-frame per distinct value of the feature column.
    partitions = [df[df[feature] == level] for level in df[feature].unique()]
    child_entropies = [entropy(part) for part in partitions]

    parent_entropy = entropy(df)

    # Weight each child's entropy by the share of rows that fall into it.
    weighted = [
        (len(part) / n_rows) * child_h
        for part, child_h in zip(partitions, child_entropies)
    ]

    return parent_entropy - sum(weighted)
    


In [4]:
def mylog(num):
    """
    Base-2 logarithm of ``num``, except that an input of 0 yields 0
    instead of being passed to ``np.log2``.
    """
    return 0 if num == 0 else np.log2(num)

In [5]:
def entropy(df):
    """
    Shannon entropy (in bits) of the class label in the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Any slice of the training data; only the last column is read.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct label values, where ``p`` is
        the fraction of rows carrying that value. Missing labels (NaN) are
        counted as a class of their own. An empty frame has entropy 0.
    """
    labels = df[df.columns[-1]]
    total = len(labels)
    if total == 0:
        return 0.0

    # value_counts replaces the manual dict tally. dropna=False keeps NaN
    # labels as one class; the dict version raised KeyError on them for
    # numeric columns because every NaN scalar is a distinct, unequal key.
    counts = labels.value_counts(dropna=False, sort=False).to_numpy(dtype=float)

    # Categorical columns also report categories absent from this slice;
    # a zero count would otherwise become 0 * log2(0) = NaN.
    counts = counts[counts > 0]

    proportions = counts / total
    return float(-(proportions * np.log2(proportions)).sum())


In [144]:
def generateTree(df):
    """
    Recursively build a decision tree from ``df`` as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; every column but the last is a feature and the last
        column is the class label.

    Returns
    -------
    dict or label
        A leaf is a bare label value. An internal node has the shape
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        ``None`` is returned only for a frame with no rows and no feature
        columns.
    """
    labels = df[df.columns[-1]]
    classes = labels.unique()

    # Pure node: every row carries the same label, so this is a leaf.
    if len(classes) == 1:
        return classes[0]

    # Only the label column is left but the rows still disagree (identical
    # feature values with conflicting labels). Fall back to the most frequent
    # label instead of recursing without end.
    if df.shape[1] <= 1:
        counts = labels.value_counts()
        return counts.idxmax() if len(counts) else None

    best_feature, _ = chooseBestFeature(df)
    tree = {best_feature: {}}

    for value in df[best_feature].unique():
        # The feature is constant inside each branch, so remove it. This
        # guarantees every recursive call sees one column fewer and the
        # recursion terminates.
        subset = df[df[best_feature] == value].drop(columns=best_feature)
        # Recurse into this same function; the earlier call to `buildTree`
        # referred to a name that is not defined anywhere in the notebook.
        tree[best_feature][value] = generateTree(subset)

    return tree

In [None]:
# Load the training table; the functions above treat the last column as the class label.
df = pd.read_table('tennis.txt')

In [141]:
# Build the decision tree: nested dicts keyed by feature name, then by feature value.
tree = generateTree(df)

In [143]:
# Display the resulting tree.
tree

{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
  'overcast': 'yes',
  'rain': {'wind': {'weak': 'yes', 'strong': 'no'}}}}