In [52]:
import pandas as pd
import numpy as np

## Tennis Dataset

In [53]:
# Load the play-tennis table from a CSV file located relative to the current working directory.
tennis=pd.read_csv('play_tennis.csv')

In [54]:
# Show the loaded table as the cell output.
tennis

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


#### Entropy of parent

In [55]:
# Relative frequency of each class label in the target column 'play'.
play = tennis['play'].value_counts().pipe(lambda counts: counts / counts.sum())
play

Yes    0.642857
No     0.357143
Name: play, dtype: float64

In [56]:
# Shannon entropy of the class distribution of the whole data set.
# np.log is the natural logarithm, so the value is expressed in nats.
entropy_parent = sum(-p * np.log(p) for p in play)
entropy_parent

0.6517565611726531

#### Entropies of columns

In [57]:
def weighted_child_entropies(frame, target, features):
    """Return the weighted average child entropy for each categorical feature.

    For every feature the rows are partitioned by the feature's distinct
    values. The entropy of ``target`` is computed inside each partition and
    the partition entropies are averaged, weighted by the fraction of rows
    that fall into each partition.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding the feature columns and the target column.
    target : str
        Name of the class-label column.
    features : iterable of str
        Names of the columns to evaluate.

    Returns
    -------
    dict
        Maps each feature name to its weighted entropy. The natural
        logarithm is used, so the values are in nats.
    """

    def label_entropy(labels):
        # Class proportions within this subset, then -sum(p * ln p).
        proportions = labels.value_counts()
        proportions = proportions / proportions.sum()
        return sum([-p * np.log(p) for p in proportions])

    result = dict()
    for feature in features:
        weights = []
        group_entropies = []
        for value in frame[feature].unique():
            # Rows in which the feature takes this particular value
            group = frame[frame[feature] == value]

            # Share of all rows that belong to this group
            weights.append(len(group) / len(frame))
            group_entropies.append(label_entropy(group[target]))

        # Weighted average of the per-group entropies
        result[feature] = np.dot(weights, group_entropies)
    return result


# The helper keeps its loop temporaries local, so this cell does not rebind
# the global `play` distribution that the parent-entropy cell reads.
# The first column (day identifier) and the last column (target) are skipped.
weighted_entropies = weighted_child_entropies(tennis, 'play', tennis.columns[1:-1])
weighted_entropies

{'outlook': 0.48072261929232607,
 'temp': 0.6315010221774208,
 'humidity': 0.5465122114944403,
 'wind': 0.6183974457364384}

#### Finding out the info gains of all the columns

In [58]:
# Information gain of a feature = parent entropy - weighted child entropy.
info_gain = {
    feature: entropy_parent - child_entropy
    for feature, child_entropy in weighted_entropies.items()
}
info_gain

{'outlook': 0.17103394188032706,
 'temp': 0.02025553899523236,
 'humidity': 0.10524434967821283,
 'wind': 0.033359115436214726}

The "outlook" column has the maximum info gain (≈ 0.171). Therefore, the first split will be done on the "outlook" column.

## Iris dataset

In [59]:
# Load the Iris table from a CSV file located relative to the current working directory.
iris=pd.read_csv('Iris.csv')

In [60]:
# Replace the column labels with short names (the frame must have exactly six columns).
# PL / PW are the petal length / petal width; SL / SW are presumably the sepal
# length / sepal width -- TODO confirm against the CSV header.
iris.columns=['Id','SL','SW','PL','PW','Species']

In [61]:
# Show the table with its shortened column names as the cell output.
iris

Unnamed: 0,Id,SL,SW,PL,PW,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


#### Entropy of parent

In [62]:
# Relative frequency of each class label in the target column 'Species'.
species = iris['Species'].value_counts().pipe(lambda counts: counts / counts.sum())
species

Iris-setosa        0.333333
Iris-virginica     0.333333
Iris-versicolor    0.333333
Name: Species, dtype: float64

In [63]:
# Shannon entropy (natural log, so in nats) of the species distribution.
# NOTE: this rebinds `entropy_parent`, which held the tennis value before this cell.
entropy_parent = sum(-p * np.log(p) for p in species)
entropy_parent

1.0986122886681096

#### Entropy of every column

In [64]:
def best_threshold_entropies(frame, target, features):
    """Return the lowest weighted entropy of a binary threshold split per feature.

    For each numeric feature, every observed value (except the value in the
    last row after sorting) is tried as a threshold ``t``. The rows are split
    into ``feature <= t`` and ``feature > t``, the entropy of ``target`` is
    computed on each side, and the two entropies are averaged weighted by the
    number of rows on each side. The smallest such average is kept.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding the feature columns and the target column.
    target : str
        Name of the class-label column.
    features : iterable of str
        Names of the numeric columns to evaluate.

    Returns
    -------
    dict
        Maps each feature name to its minimum weighted split entropy. The
        natural logarithm is used, so the values are in nats.

    Raises
    ------
    ValueError
        If ``frame`` has fewer than two rows, because no candidate threshold
        exists and ``min`` receives an empty sequence.
    """

    def label_entropy(labels):
        # Class proportions within this subset, then -sum(p * ln p).
        # An empty subset yields 0 because the sum runs over no terms.
        proportions = labels.value_counts()
        proportions = proportions / proportions.sum()
        return sum([-p * np.log(p) for p in proportions])

    result = dict()
    for feature in features:
        # Sorting the dataframe based on the feature
        ordered = frame.sort_values(feature)

        # Candidate thresholds: all values but the one in the last sorted row.
        # Repeated values give the same split, so each one is tried only once.
        thresholds = ordered[feature].iloc[:-1].unique()

        split_entropies = []
        for threshold in thresholds:
            # Separate the data at this threshold
            left = ordered[ordered[feature] <= threshold]
            right = ordered[ordered[feature] > threshold]

            # Size-weighted average of the two child entropies
            weighted = (
                len(left) * label_entropy(left[target])
                + len(right) * label_entropy(right[target])
            ) / len(ordered)
            split_entropies.append(weighted)

        # The best split of a feature is the one with the lowest entropy
        result[feature] = min(split_entropies)
    return result


# The helper keeps its temporaries local, so this cell does not rebind the
# `weighted_entropies` dict of the tennis section or the global `species`
# distribution. The first column (Id) and the last column (target) are skipped.
entropies = best_threshold_entropies(iris, 'Species', iris.columns[1:-1])

In [65]:
# Show the lowest weighted split entropy found for each feature.
entropies

{'SL': 0.7123680221988984,
 'SW': 0.9129102784746159,
 'PL': 0.4620981203732969,
 'PW': 0.4620981203732969}

#### Info gains of the columns

In [66]:
# Information gain of a feature = parent entropy - best weighted split entropy.
info_gain = {
    feature: entropy_parent - split_entropy
    for feature, split_entropy in entropies.items()
}
info_gain

{'SL': 0.3862442664692112,
 'SW': 0.18570201019349364,
 'PL': 0.6365141682948127,
 'PW': 0.6365141682948127}

The Petal Length (PL) and Petal Width (PW) columns have the same, maximum info gain (≈ 0.637). Either of them can be used for the first split of the data.