In [1]:
import numpy as np
import pandas as pd

In [9]:
def find_entropy(df):
    entropy = 0
    target = df.keys()[-1]
    print(target)
    target_values = df[target].unique()
    print(target_values)
    
    for value in target_values:
        prob = len(df[df[target]==value])/len(df) # no. of instances where in df we find df[target] == value
        entropy += -(prob * np.log2(prob))
    
    return entropy

In [3]:
df = pd.read_csv('weather.csv')

In [10]:
find_entropy(df)

Play
[0 1]


0.9402859586706311

In [35]:
def find_avg_info_entropy(df, attribute):
    attribute_values = df[attribute].unique() # no. of unique values in attribute
    target = df.keys()[-1]
    target_values = df[target].unique() # no. of unique values in target
    avg_info_entropy = 0
    print(attribute_values)
    print(target_values)
    for value1 in attribute_values:
        entropy_subsample = 0
        for value2 in target_values:
            num = len(df[attribute][df[attribute] == value1][df[target] == value2])
            den = len(df[attribute][df[attribute] == value1])
            prob = num/den
            entropy_subsample += -(prob * np.log2(prob + 1e-7))
        weight =  den/len(df)
        avg_info_entropy += weight * entropy_subsample
    return avg_info_entropy

In [36]:
find_avg_info_entropy(df, 'Outlook')

['rainy' 'overcast' 'sunny']
[0 1]


0.6935358915770655

In [37]:
def find_winner(df):
    attributes = df.keys()[:-1] # ALL EXCEPT LAST
    IG = []
    for attribute in attributes:
        IG.append(find_entropy(df) - find_avg_info_entropy(df, attribute))
    return df.keys()[:-1][np.argmax(IG)]

In [38]:
find_winner(df)

Play
[0 1]
['rainy' 'overcast' 'sunny']
[0 1]
Play
[0 1]
['hot' 'mild' 'cool']
[0 1]
Play
[0 1]
['high' 'normal']
[0 1]
Play
[0 1]
[0 1]
[0 1]


'Outlook'

In [51]:
def training(df, tree=None):
    print("----")
    target = df.keys()[-1]
    attribute = find_winner(df)
    attribute_values = df[attribute].unique()
    if tree is None:
        tree = {}
        tree[attribute] = {}
    for value in attribute_values:
        sub_df = df[df[attribute] == value].reset_index(drop = True)
        print(sub_df)
        clvalue, count = np.unique(sub_df[target], return_counts=True)
        print(f"clvalue = {clvalue}")
        print(f"count = {count}")
        if len(count) == 1:
            tree[attribute][value] = clvalue[0]
        else:
            tree[attribute][value] = training(sub_df)
    return tree

In [52]:
tree = training(df)

----
Play
[0 1]
['rainy' 'overcast' 'sunny']
[0 1]
Play
[0 1]
['hot' 'mild' 'cool']
[0 1]
Play
[0 1]
['high' 'normal']
[0 1]
Play
[0 1]
[0 1]
[0 1]
  Outlook  Temp Humidity  Windy  Play
0   rainy   hot     high      0     0
1   rainy   hot     high      1     0
2   rainy  mild     high      0     0
3   rainy  cool   normal      0     1
4   rainy  mild   normal      1     1
clvalue = [0 1]
count = [3 2]
----
Play
[0 1]
['rainy']
[0 1]
Play
[0 1]
['hot' 'mild' 'cool']
[0 1]
Play
[0 1]
['high' 'normal']
[0 1]
Play
[0 1]
[0 1]
[0 1]
  Outlook  Temp Humidity  Windy  Play
0   rainy   hot     high      0     0
1   rainy   hot     high      1     0
2   rainy  mild     high      0     0
clvalue = [0]
count = [3]
  Outlook  Temp Humidity  Windy  Play
0   rainy  cool   normal      0     1
1   rainy  mild   normal      1     1
clvalue = [1]
count = [2]
    Outlook  Temp Humidity  Windy  Play
0  overcast   hot     high      0     1
1  overcast  cool   normal      1     1
2  overcast  mild     high 

In [44]:
tree

{'Outlook': {'rainy': {'Humidity': {'high': 0, 'normal': 1}},
  'overcast': 1,
  'sunny': {'Windy': {0: 1, 1: 0}}}}

In [55]:
def prediction(instance, tree):
    for attribute in tree.keys():
        value = instance[attribute]
        tree = tree[attribute][value]
        if type(tree) is dict:
            predict = prediction(instance, tree)
        else: # leaf node found (a value is returned)
            predict = tree
            break
    return predict

In [57]:
test = pd.read_csv('weather_test.csv')

In [58]:
test

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,rainy,cool,high,1,0
1,overcast,mild,normal,0,1


In [60]:
Y_predict = []
for i in range(len(test)):
    instance = test.iloc[i,:]
    predicted_value = prediction(instance, tree)
    print(f"for {instance}: {predicted_value}")
    Y_predict.append(predicted_value)

for Outlook     rainy
Temp         cool
Humidity     high
Windy           1
Play            0
Name: 0, dtype: object: 0
for Outlook     overcast
Temp            mild
Humidity      normal
Windy              0
Play               1
Name: 1, dtype: object: 1


In [61]:
Y_predict

[0, 1]