In [76]:
import pandas as pd
from collections import Counter
import numpy as np
import math
# Load the Play Tennis dataset from a CSV file resolved relative to the
# notebook's working directory.
tennis_data = pd.read_csv('4_playtennis.csv')
# Bare last expression: the notebook renders the loaded frame as the cell output.
tennis_data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [77]:
def compute_entropy(probs):
    """Return the base-2 entropy of an iterable of probabilities.

    Each strictly positive probability ``p`` contributes ``-p * log2(p)``.
    Any other value (zero or negative) is added to the total unchanged,
    so a zero probability contributes nothing.
    """
    def term(p):
        # log is undefined at 0, so non-positive values bypass it.
        if p > 0:
            return -p * math.log(p, 2)
        return p

    return sum(map(term, probs))

def entropy_labels(label_list):
    """Return the base-2 entropy of the label distribution in ``label_list``.

    ``label_list`` is any sized iterable of hashable labels. Each distinct
    label's relative frequency is passed to ``compute_entropy``.
    """
    n_labels = len(label_list)
    frequencies = Counter(label_list).values()
    return compute_entropy([freq / n_labels for freq in frequencies])


In [78]:
def info_gain(df, splitting_attr, target_attr):
    """Return the information gain of splitting ``df`` on ``splitting_attr``.

    The gain is the entropy of ``target_attr`` over all rows minus the
    size-weighted average entropy of ``target_attr`` within each group of
    rows sharing a value of ``splitting_attr``.
    """
    n_rows = len(df)

    # One row per distinct value of the splitting attribute: the entropy of
    # the target labels in that group and the fraction of rows it holds.
    per_value = df.groupby(splitting_attr)[target_attr].agg(
        Entropy=entropy_labels,
        Proportion=lambda labels: len(labels) / n_rows,
    )

    conditional_entropy = sum(per_value['Entropy'] * per_value['Proportion'])
    overall_entropy = entropy_labels(df[target_attr])

    return overall_entropy - conditional_entropy


In [79]:
def build_tree(df,target_attr,remain_attr,majority_class=None):
    """Recursively build an ID3-style decision tree as nested dicts.

    Parameters
    ----------
    df : DataFrame holding the rows that reach this node.
    target_attr : name of the label column.
    remain_attr : attribute names still available for splitting.
    majority_class : most common label at the parent node; returned only
        when ``df`` has no rows.

    Returns
    -------
    Either a class label (leaf) or ``{attribute: {value: subtree, ...}}``
    where ``attribute`` is the remaining attribute with the highest
    information gain at this node.
    """
    class_counts = Counter(df[target_attr])

    # Pure node: every row carries the same label.
    if len(class_counts) == 1:
        return next(iter(class_counts))

    # No rows to learn from: fall back to the parent's majority label.
    if df.empty:
        return majority_class

    majority_class = max(class_counts, key=class_counts.get)

    # Attributes exhausted while labels are still mixed: the rows at THIS node
    # decide the leaf, not the parent's majority (which is None at the root).
    if not remain_attr:
        return majority_class

    info_gains = {attr: info_gain(df,attr,target_attr) for attr in remain_attr}
    best_attr = max(info_gains, key=info_gains.get)

    # Keep the caller's attribute order. Going through a set would make the
    # order (and therefore tie-breaking between equal gains) vary from one
    # interpreter session to the next. Computed once, not per branch.
    subtree_attr = [attr for attr in remain_attr if attr != best_attr]

    decision_tree = {best_attr: {}}
    for attr_v, subset in df.groupby(best_attr):
        decision_tree[best_attr][attr_v] = build_tree(
            subset, target_attr, subtree_attr, majority_class
        )

    return decision_tree


In [80]:
# Every column except the label is a candidate splitting attribute.
ten_attr = [col for col in tennis_data.columns if col != 'Play Tennis']
# Learn the tree on the full frame; the bare last expression displays it.
dt = build_tree(tennis_data, 'Play Tennis', ten_attr)
dt

{'Outlook': {'Overcast': 'Yes',
  'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
  'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}

In [87]:
def predict(instance, dt, default_class=None):
    """Classify ``instance`` by walking the nested-dict decision tree ``dt``.

    Parameters
    ----------
    instance : mapping-like object with a ``.get`` method (e.g. a dict or a
        pandas row) from attribute name to value.
    dt : tree of the form ``{attribute: {value: subtree_or_label}}``, or a
        bare label.
    default_class : returned when the tree is empty/``None`` or when the
        instance's value for a split attribute has no branch.

    Returns
    -------
    The label stored at the reached leaf, or ``default_class``.
    """
    node = dt

    # Any non-dict node is a leaf, so labels of any type (str, int, bool,
    # numpy scalars, ...) work, including falsy ones such as 0.
    while isinstance(node, dict) and node:
        split_attr = next(iter(node))
        branches = node[split_attr]
        # An unseen attribute value sends us straight to the default.
        node = branches.get(instance.get(split_attr), default_class)

    # None or an empty dict means there is no usable tree at this point.
    if node is None or isinstance(node, dict):
        return default_class
    return node

In [88]:
# Re-read the dataset and keep the rows from position 11 onward for prediction.
# .copy() makes the slice an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
# NOTE(review): the tree `dt` was built from the full contents of this same
# file, so these rows were also seen during training.
new_data = pd.read_csv('4_playtennis.csv').iloc[11:].copy()
new_data['Predicted'] = new_data.apply(lambda row: predict(row, dt, 'Unknown'), axis=1)
print(new_data)

     Outlook Temperature Humidity    Wind Play Tennis Predicted
11  Overcast        Mild     High  Strong         Yes       Yes
12  Overcast         Hot   Normal    Weak         Yes       Yes
13      Rain        Mild     High  Strong          No        No
