# Data Munging

In [2]:
# Importing libraries 
import numpy as np 
import pandas as pd
import janitor

# !pip3 install pyjanitor # run this once

import sklearn 
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_csv('../data/mushrooms.csv')

In [4]:
data.info() # We want to see the data types and how many null values do we have in the dataset.  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
data.columns # We want to clean column names automatically by replacing each - with _

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [6]:
data = data.clean_names()

In [7]:
data.columns # Check column names after automatic cleaning

Index(['class', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [8]:
data.describe()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [9]:
# We want to see what different values each column contains
# From here, we can see that the veil_type has one single value and therefore is redundant and not informative so we can proceed with dropping it 
# All the mushrooms in our dataset have partial veils, so the column veil_type is not informative. 
data.columns.tolist()
for col in data.columns.tolist(): 
    print(col,':  ',data[col].unique())

class :   ['p' 'e']
cap_shape :   ['x' 'b' 's' 'f' 'k' 'c']
cap_surface :   ['s' 'y' 'f' 'g']
cap_color :   ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises :   ['t' 'f']
odor :   ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill_attachment :   ['f' 'a']
gill_spacing :   ['c' 'w']
gill_size :   ['n' 'b']
gill_color :   ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk_shape :   ['e' 't']
stalk_root :   ['e' 'c' 'b' 'r' '?']
stalk_surface_above_ring :   ['s' 'f' 'k' 'y']
stalk_surface_below_ring :   ['s' 'f' 'y' 'k']
stalk_color_above_ring :   ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk_color_below_ring :   ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil_type :   ['p']
veil_color :   ['w' 'n' 'o' 'y']
ring_number :   ['o' 't' 'n']
ring_type :   ['p' 'e' 'l' 'f' 'n']
spore_print_color :   ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population :   ['s' 'n' 'a' 'v' 'y' 'c']
habitat :   ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [10]:
data.drop('veil_type', axis = 1, inplace = True)

In [11]:
# We can see that columns have multiple repetitive values (letters), even though they mean different things in each column. 
# This might cause some problems like duplications when we create dummy variables for these categorical values. 

# We can also see that the column 'stalk_root' has a non-alphanumeric value and it might need some munging. 
# According to the dataset's documentation, the value '?' in stalk_root means that they are missing or unknown stalk root data. 
# Let's see how many of these missing values we have to decide if it'd be okay to drop these rows. 

vals = data['stalk_root'].value_counts().index.values.tolist()

NA_count = data['stalk_root'].value_counts().values

NA_frac = data['stalk_root'].value_counts().to_list()
NA_frac = [i/sum(NA_frac) for i in NA_frac]

pd.DataFrame(zip(NA_count,NA_frac), columns=['Count','Fraction'], index= vals)

# So, now we can see that if we drop the missing values in this column we're losing 30% of our data which accounts for about 2500 instances. 
# Dropping the rows is not the best solution in this case. 
# Therefore, we'll try to impute using KNN.
# Before that, the categorical value must be numerically encoded/labelled from 0 to n. 


# ['population', 'cap_shape', 'stalk_shape', 'stalk_surface_above_ring', 'stalk_surface_below_ring', 'stalk_color_below_ring']

Unnamed: 0,Count,Fraction
b,3776,0.464796
?,2480,0.305268
e,1120,0.137863
c,556,0.068439
r,192,0.023634


In [12]:
# Let's see the order of values in this column
data.stalk_root.unique()

array(['e', 'c', 'b', 'r', '?'], dtype=object)

In [13]:
le = preprocessing.LabelEncoder()

for i in data.columns.tolist():
    data[i]= le.fit_transform(data[i])

data

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,2,7,7,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,2,7,7,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,2,7,7,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,2,7,7,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,2,7,7,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,2,5,5,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,2,5,5,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,2,5,5,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,2,1,7,7,2,1,0,7,4,2


In [14]:
# Order of values after encoding 
data['stalk_root'].unique()

array([3, 2, 1, 4, 0])

In [15]:
# But, for the models to impute the missing data, we should replace each 4 with a NaN. 
data.replace({'stalk_root': {0: np.nan}}).stalk_root.value_counts()

1.0    3776
3.0    1120
2.0     556
4.0     192
Name: stalk_root, dtype: int64

In [16]:
imputer = KNNImputer(missing_values = np.nan, n_neighbors=5, weights = 'distance')
imputer.fit_transform(data[['stalk_root']])

array([[3.],
       [2.],
       [2.],
       ...,
       [0.],
       [0.],
       [0.]])

In [17]:
data.stalk_root.value_counts()

1    3776
0    2480
3    1120
2     556
4     192
Name: stalk_root, dtype: int64

In [18]:
# We can see that KNNImputer didn't give us any useful results and we're again back on square 1. Therefore, we'll just drop the column. 
data.drop('stalk_root', axis= 1, inplace= True) # We might need to keep this for DataViz purposes not for modelling.

In [19]:
# Are the classes blanced within our data?  
vals = data['class'].value_counts().index.values.tolist()

class_count = data['class'].value_counts().values

class_frac = data['class'].value_counts().to_list()
class_frac = [round((i/sum(class_frac))*100, 2) for i in class_frac]

pd.DataFrame(zip(class_count,class_frac), columns=['Count','Fraction'], index= vals)

# They are "adequately" balanced and there's no need for any oversampling techniques. 

Unnamed: 0,Count,Fraction
0,4208,51.8
1,3916,48.2


In [20]:
# ss = StandardScaler()
# ss.fit(data.drop('class', axis=1))
# data = ss.transform(data.drop('class', axis=1))

In [21]:
# data.to_csv('../data/mushroom_clean.csv') 

# DataViz

In [22]:
# Descriptive Statistics
# Heat map, frequencies. 
# Relationship between habitat and poisonous, colors how informative are they in terms of edibility. 
# Visualizations

# Categorical Feature Engineering VV
# Target Engineering >> Label encoding 

# Baseline model

In [23]:
from copy import deepcopy
data_scaled = deepcopy(data)
data_scaled.drop('class', axis=1, inplace = True)

ss = StandardScaler()

ss.fit(data_scaled)
data_scaled = ss.transform(data_scaled)

In [24]:
data_scaled

array([[ 1.02971224,  0.14012794, -0.19824983, ..., -0.67019486,
        -0.5143892 ,  2.03002809],
       [ 1.02971224,  0.14012794,  1.76587407, ..., -0.2504706 ,
        -1.31310821, -0.29572966],
       [-2.08704716,  0.14012794,  1.37304929, ..., -0.2504706 ,
        -1.31310821,  0.86714922],
       ...,
       [-0.8403434 ,  0.14012794, -0.19824983, ..., -1.50964337,
        -2.11182722,  0.28570978],
       [-0.21699152,  0.95327039, -0.19824983, ...,  1.42842641,
         0.28432981,  0.28570978],
       [ 1.02971224,  0.14012794, -0.19824983, ...,  0.16925365,
        -2.11182722,  0.28570978]])

In [25]:
X = data_scaled
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [26]:
# Logistic Regression
lr = LogisticRegression(max_iter=500)

# training & prediction
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# evaluation
print('Confusion Matrix :\n', confusion_matrix(y_test, lr_pred))
print()
print('Classification Report :\n', classification_report(y_test, lr_pred))

Confusion Matrix :
 [[1017   49]
 [  66  899]]

Classification Report :
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      1066
           1       0.95      0.93      0.94       965

    accuracy                           0.94      2031
   macro avg       0.94      0.94      0.94      2031
weighted avg       0.94      0.94      0.94      2031



# Feature Filtering

In [27]:
# Variance thresholding (0.1, 0.05) 
# 10% variance resulted in dropping 2 column only, while 5% didn't do us any good. 
# What were the columns dropped? 
def variance_threshold_selector(data, threshold=0.1):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data.columns[selector.get_support(indices=True)]

variance_threshold_cols = variance_threshold_selector(data.drop('class', axis = 1))
variance_threshold_cols # Threshold values 5% and 10% are a rule of thumb

Index(['cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_spacing', 'gill_size', 'gill_color', 'stalk_shape',
       'stalk_surface_above_ring', 'stalk_surface_below_ring',
       'stalk_color_above_ring', 'stalk_color_below_ring', 'ring_type',
       'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [28]:
X = data[variance_threshold_cols]
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [29]:
# Logistic Regression
lr = LogisticRegression(max_iter=500)

# training & prediction
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# evaluation
print('Confusion Matrix :\n', confusion_matrix(y_test, lr_pred))
print()
print('Classification Report :\n', classification_report(y_test, lr_pred))

Confusion Matrix :
 [[1017   52]
 [  67  895]]

Classification Report :
               precision    recall  f1-score   support

           0       0.94      0.95      0.94      1069
           1       0.95      0.93      0.94       962

    accuracy                           0.94      2031
   macro avg       0.94      0.94      0.94      2031
weighted avg       0.94      0.94      0.94      2031



In [30]:
# Chi-squared test for feature selection 



In [55]:
# Lasso 
# Start by splitting
X = data_scaled
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Select features from the model
sel_ = SelectFromModel(LogisticRegression(C=0.01, penalty='l1', solver='liblinear'))
sel_.fit(X_train, np.ravel(y_train,order='C'))
sel_.get_support()
X_train = pd.DataFrame(X_train)


# To view the set of selected features
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
np.sum(sel_.estimator_.coef_ == 0)))
print()

sel_col_l1 = list()
[sel_col_l1.append(data.columns[i]) for i in X_train.columns[sel_.get_support(indices=True)].tolist()]
print('Selected columns using Lasso:', '\n',sel_col_l1, '\n')

# Logistic Regression
lr = LogisticRegression(max_iter=500)

# training & prediction
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# evaluation
print('Confusion Matrix :\n', confusion_matrix(y_test, lr_pred))
print()
print('Classification Report :\n', classification_report(y_test, lr_pred))


total features: 20
selected features: 11
features with coefficients shrank to zero: 9

Selected columns using Lasso: 
 ['cap_shape', 'cap_color', 'bruises', 'gill_attachment', 'gill_spacing', 'gill_size', 'stalk_shape', 'stalk_surface_above_ring', 'stalk_surface_below_ring', 'stalk_color_below_ring', 'spore_print_color'] 

Confusion Matrix :
 [[991  47]
 [ 60 933]]

Classification Report :
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      1038
           1       0.95      0.94      0.95       993

    accuracy                           0.95      2031
   macro avg       0.95      0.95      0.95      2031
weighted avg       0.95      0.95      0.95      2031



## Dimensionality reduction

In [28]:
# PCA 
# The reason why we do not apply PCA before training the baseline model is that PCA transforms features into the principal 
# components which loose their explainability. Yet, explainability is the main requirement for a baseline model.




# Model Development

In [29]:
# # Model dev 
# # Standard Scal
# Deep learning 

