In [None]:
'''
TODO:
- clean it up, write documentation for each step, link with IG calculation notebook

QUESTIONS:
- how to visualize a decision tree trained on dummy-encoded features?
- what is the proper way to create an out-of-sample dataframe for test predictions?
- is the cross-validation score enough to validate the pipeline?
'''


In [112]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # statistical data visualization
%matplotlib inline

from scipy.stats import entropy

## Other
from pathlib import Path

In [113]:
## ML import
import category_encoders as ce
import graphviz 

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report


In [114]:
## Functions ##
###############

def weight_values(df):
    '''
    Constructs a dataframe containing normalized value counts for each 
    input dataframe's attribute.

    The result has a two-level index ('attribute', 'value') and a single
    column 'val_weights' holding the relative frequency of each value within
    its attribute. An input without columns yields an empty dataframe.

    :df: pd.DataFrame
    :return: pd.DataFrame
    '''

    frames = []

    for col in df.columns:
        ## Relative counts of the column's values (value_counts skips NaN by default)
        weights = df[col].value_counts(normalize=True).to_frame('val_weights')

        ## Prepend the attribute name as the outer index level
        frames.append(pd.concat([weights], keys=[col], names=['attribute', 'value']))

    ## Nothing to weigh when the input has no columns
    if not frames:
        return pd.DataFrame()

    ## Concatenate once instead of growing the output inside the loop
    return pd.concat(frames)


def weight_segments(s_df, w_df, l_name = 'class'):
    '''
    Performs a segmentation of categorical attributes and calculates label weights for each attribute's value.

    For every (attribute, value) pair found in the weight dataframe's index, the
    rows of the source dataframe where the attribute equals the value are selected
    and the normalized counts of the label column are computed. Each pair becomes
    one output row; labels that do not occur in a segment are left as NaN.

    :s_df: source dataframe pd.DataFrame()
    :w_df: dataframe containing value weights for each attribute pd.DataFrame(),
           indexed by a two-level ('attribute', 'value') index
    :l_name: label name str
    :return: pd.DataFrame() indexed by ('attribute', 'value') with one column per label
    '''

    segment_rows = []

    ## For each attribute in weight dataframe
    for attribute in w_df.index.levels[0]:

        ## For each value of that attribute in weight dataframe
        for val in w_df.loc[attribute].index:

            ## Label weights inside the segment where attribute == val
            in_segment = s_df[attribute] == val
            label_weights = s_df.loc[in_segment, l_name].value_counts(normalize=True)

            ## One row per segment: labels become the columns
            row = label_weights.to_frame().T

            ## Identify the row by its (attribute, value) pair
            row.index = pd.MultiIndex.from_tuples(
                [(attribute, val)], names=['attribute', 'value']
            )
            segment_rows.append(row)

    ## No segments to report
    if not segment_rows:
        return pd.DataFrame()

    ## Concatenate once instead of growing the output inside the loop
    return pd.concat(segment_rows)

In [115]:
## Load the mushroom dataset ##
###############################

## The CSV is expected next to the notebook (path is relative to the working directory)
mushroom_set = pd.read_csv(Path('mushrooms.csv'))
mushroom_set.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [116]:
## Per-value weights of every attribute and the label weights inside each segment
weight_df = weight_values(mushroom_set)
segmented_df = weight_segments(mushroom_set, weight_df)

## Combine both frames for the entropy calculation; missing entries are filled with 0
entropy_df = segmented_df.join(weight_df).fillna(0)
entropy_df

Unnamed: 0_level_0,Unnamed: 1_level_0,p,e,val_weights
attribute,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bruises,f,0.693345,0.306655,0.584441
bruises,t,0.184834,0.815166,0.415559
cap-color,n,0.446585,0.553415,0.281142
cap-color,g,0.439130,0.560870,0.226489
cap-color,e,0.584000,0.416000,0.184638
...,...,...,...,...
veil-color,w,0.493185,0.506815,0.975382
veil-color,o,0.000000,1.000000,0.011817
veil-color,n,0.000000,1.000000,0.011817
veil-color,y,1.000000,0.000000,0.000985


In [117]:
## Parent entropy: base-2 entropy of the label weights stored under the 'class' attribute
class_weights = [
    entropy_df.loc[('class', label), 'val_weights']
    for label in ('p', 'e')
]
parent_entropy = entropy(class_weights, base=2)

In [118]:
## Base-2 entropy of the (p, e) label weights of every segment ...
label_probs = np.vstack([entropy_df['p'], entropy_df['e']])
entropy_df['segment_entropy'] = entropy(label_probs, base=2)

## ... scaled by the share of rows that fall into the segment
entropy_df['weighted_entropy'] = entropy_df['segment_entropy'] * entropy_df['val_weights']
entropy_df

Unnamed: 0_level_0,Unnamed: 1_level_0,p,e,val_weights,segment_entropy,weighted_entropy
attribute,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bruises,f,0.693345,0.306655,0.584441,0.889275,0.519729
bruises,t,0.184834,0.815166,0.415559,0.690539,0.286960
cap-color,n,0.446585,0.553415,0.281142,0.991752,0.278823
cap-color,g,0.439130,0.560870,0.226489,0.989283,0.224062
cap-color,e,0.584000,0.416000,0.184638,0.979544,0.180861
...,...,...,...,...,...,...
veil-color,w,0.493185,0.506815,0.975382,0.999866,0.975251
veil-color,o,0.000000,1.000000,0.011817,0.000000,0.000000
veil-color,n,0.000000,1.000000,0.011817,0.000000,0.000000
veil-color,y,1.000000,0.000000,0.000985,0.000000,0.000000


In [119]:
## Collect (attribute, information gain) tuples, where
## information gain = parent entropy - sum of the attribute's weighted segment entropies
ig_list = []
for attribute in entropy_df.index.levels[0]:
    segments = entropy_df.loc[attribute]

    ## Skip the label itself and attributes with a single segment
    if len(segments) <= 1 or attribute == 'class':
        continue

    ig_list.append((attribute, parent_entropy - segments['weighted_entropy'].sum()))

In [120]:
## Information gain dataframe, indexed by attribute name
ig_df = (
    pd.DataFrame(ig_list, columns=['Attribute', 'Information Gain'])
    .set_index('Attribute')
)

## Display only: attributes ordered from highest to lowest gain (ig_df itself stays unsorted)
ig_df.sort_values(by='Information Gain', ascending=False)

Unnamed: 0_level_0,Information Gain
Attribute,Unnamed: 1_level_1
odor,0.906075
spore-print-color,0.480705
gill-color,0.416978
ring-type,0.318022
stalk-surface-above-ring,0.284726
stalk-surface-below-ring,0.271894
stalk-color-above-ring,0.253845
stalk-color-below-ring,0.241416
gill-size,0.230154
population,0.201958


In [212]:
## Attribute(s) the classifier is trained on
features_to_use = ['stalk-shape']

In [213]:
## Separate the label series from the selected feature columns
y = mushroom_set['class']
X = mushroom_set[features_to_use]

## Hold out a very small share of the rows as out-of-sample examples
X_train, X_out, y_train, y_out = train_test_split(
    X,
    y,
    test_size=0.0005,
    random_state=42,
)

In [214]:
## Inspect the held-out labels
y_out

1971    e
6654    p
5606    p
3332    e
6988    p
Name: class, dtype: object

In [215]:
## Count missing values per feature column
X.isna().sum()

stalk-shape    0
dtype: int64

In [216]:
## Count missing values in the labels
y.isna().sum()

0

In [217]:
## Instantiate One Hot Encoder
## handle_unknown='ignore': categories not seen during fit do not raise at transform time
ohe = OneHotEncoder(handle_unknown='ignore')

In [218]:
## One-hot encode the selected features; any remaining column is passed through unchanged
column_trans = make_column_transformer((ohe, features_to_use), remainder='passthrough')

In [219]:
## Decision tree that splits on entropy, limited to depth 4, with a fixed seed
clf_en = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=42,
)

In [220]:
from sklearn.pipeline import make_pipeline

In [221]:
## Chain the encoding step and the classifier into one estimator
pipe = make_pipeline(column_trans, clf_en)

In [222]:
## Lets evaluate our model
from sklearn.model_selection import cross_val_score

In [223]:
## Mean accuracy of the pipeline over 6 cross-validation folds of the training data
cv_scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=6)
cv_scores.mean()

0.5530232977903835

In [224]:
## Relative frequency of each label in the training data (reference point for the accuracy above)
y_train.value_counts(normalize=True)

e    0.518044
p    0.481956
Name: class, dtype: float64

In [209]:
## Fit the whole pipeline on the training data
## NOTE(review): the stored output of this cell lists ['odor', 'spore-print-color'] and its
## execution count is lower than that of the cell defining features_to_use = ['stalk-shape'],
## so the saved output looks stale - re-run the notebook top to bottom to confirm
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['odor',
                                                   'spore-print-color'])])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                        random_state=42))])

In [210]:
## Predict labels for the held-out rows
pipe.predict(X_out)

array(['e', 'p', 'p', 'e', 'p'], dtype=object)

In [211]:
## True labels of the held-out rows, for comparison with the predictions
y_out

1971    e
6654    p
5606    p
3332    e
6988    p
Name: class, dtype: object