# Classification

In [2]:
import pandas as pd
import math
import textwrap

## Step 0
- You will get a csv file from us. Load it in your language/environment.
- Explore the data in it. Identify the input data $X$ and the labels.

In [3]:
data_frame: pd.DataFrame = pd.read_csv("data-cls.csv")
data_frame

Unnamed: 0,forecast,temperature,humidity,wind,tennis
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rainy,mild,high,weak,yes
4,rainy,cool,normal,weak,yes
5,rainy,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


## Step 1
- Implement an ID3 decision tree*.

In [47]:
def entropy(data_frame: pd.DataFrame, classification: str):
    entry_count = data_frame.shape[0]
    return sum((- (value / entry_count) * math.log2(value / entry_count)) for value in
               data_frame[classification].value_counts())

1.924174352300441

In [81]:
def find_split_column(data_frame: pd.DataFrame, classification: str):
    ent = entropy(data_frame, classification)
    e = dict()
    
    for column_name in data_frame.drop(classification, 1):
        value_counts: pd.Series = data_frame[column_name].value_counts(normalize=True)
        entropys = value_counts.index.to_series().apply(
            lambda x: entropy(data_frame[data_frame[column_name] == x],
                             classification))
        new_ent = (value_counts * entropys).sum()
        information_gain = ent - new_ent
        e[column_name] = information_gain

    best_column = max(e, key=e.get)
    return best_column

In [82]:
def id3(data_frame, classification: str, last_split_column: str="", check=None, matching=True):
    ent = entropy(data_frame, classification)
    if ent == 0:
        if check is None:
            return '-> ' + data_frame[classification].iloc[0]
#         elif matching:
#             return data_frame[classification].iloc[0]
#         else:
#             return None
        
    split_column = find_split_column(data_frame, classification)
    if split_column == last_split_column:
        if check is None:
            return str(data_frame[classification].value_counts(normalize=True))
#         else:
#             return data_frame[classification].value_counts(normalize=True)
    
    if check is None:
        result = split_column + ' ?'
#     else:
#         result = None
    print(pd.unique(data_frame[split_column]))
    for value in pd.unique(data_frame[split_column]):
        df = data_frame[data_frame[split_column] == value]

        if check is None:
            result += f'\n |{value}'
            result += f'\n{textwrap.indent(id3(df, classification, split_column), " |  ")}'
#         else:
#             value_matches = matching and split_column in check and check[split_column] == value
#             result = result or id3(df, classification, split_column, check, value_matches)
    return result

In [83]:
print(id3(data_frame, 'tennis'))

['sunny' 'overcast' 'rainy' 'rain']
['high' 'normal']
['weak' 'strong']
['normal' 'high']
forecast ?
 |sunny
 |  humidity ?
 |   |high
 |   |  -> no
 |   |normal
 |   |  -> yes
 |overcast
 |  -> yes
 |rainy
 |  wind ?
 |   |weak
 |   |  -> yes
 |   |strong
 |   |  -> no
 |rain
 |  humidity ?
 |   |normal
 |   |  -> yes
 |   |high
 |   |  -> no


## Step 2
- Use your decision tree to classify: rainy forecast, hot temperature, high humidity, strong wind

In [7]:
result = id3(data_frame, 'tennis', check={'forecast':'rainy',
                                        'temperature':'hot',
                                        'humidity': 'high',
                                        'wind': 'strong'})
print(result)

no
