Calling necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, accuracy_score, make_scorer, classification_report

from joblib import dump,load


import warnings
warnings.filterwarnings('ignore')

Uploading data

In [2]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\Downloads\archive\mushrooms.csv")

Data exploration

In [3]:
# overall look at the data
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# looking through data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
# checking missing values
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [6]:
# checking unique values
df.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

Data Preprocessing

In [7]:
# encoding

cat_col = df.select_dtypes(include='object').columns
le = LabelEncoder()

for col in cat_col:
 cardinality = df[col].nunique()
 if cardinality >= 4:
    df[col] = le.fit_transform(df[col])
 else:
    df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   cap-shape                 8124 non-null   int64
 1   cap-surface               8124 non-null   int64
 2   cap-color                 8124 non-null   int64
 3   odor                      8124 non-null   int64
 4   gill-color                8124 non-null   int64
 5   stalk-root                8124 non-null   int64
 6   stalk-surface-above-ring  8124 non-null   int64
 7   stalk-surface-below-ring  8124 non-null   int64
 8   stalk-color-above-ring    8124 non-null   int64
 9   stalk-color-below-ring    8124 non-null   int64
 10  veil-color                8124 non-null   int64
 11  ring-type                 8124 non-null   int64
 12  spore-print-color         8124 non-null   int64
 13  population                8124 non-null   int64
 14  habitat                   8124 non-null 

In [9]:
# scaling
# dropping target column so that it does not get scaled, because if it does, values become regression form which we dont want
scaler = StandardScaler()
x = df.drop(columns=['odor'])
y = df['odor']

x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)

df = pd.concat([x_scaled, y], axis=1)

In [10]:
df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,gill-color,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,...,habitat,class_p,bruises_t,gill-attachment_f,gill-spacing_w,gill-size_n,stalk-shape_t,ring-number_o,ring-number_t,odor
0,1.029712,0.140128,-0.19825,-0.228998,1.78146,0.683778,0.586385,0.622441,0.631991,0.142037,...,2.030028,1.036613,1.185917,0.162896,-0.438864,1.494683,-1.144806,0.291438,-0.282391,6
1,1.029712,0.140128,1.765874,-0.228998,0.838989,0.683778,0.586385,0.622441,0.631991,0.142037,...,-0.29573,-0.96468,1.185917,0.162896,-0.438864,-0.669038,-1.144806,0.291438,-0.282391,0
2,-2.087047,0.140128,1.373049,0.053477,0.838989,0.683778,0.586385,0.622441,0.631991,0.142037,...,0.867149,-0.96468,1.185917,0.162896,-0.438864,-0.669038,-1.144806,0.291438,-0.282391,3
3,1.029712,0.95327,1.373049,0.053477,1.78146,0.683778,0.586385,0.622441,0.631991,0.142037,...,2.030028,1.036613,1.185917,0.162896,-0.438864,1.494683,-1.144806,0.291438,-0.282391,6
4,1.029712,0.140128,-0.591075,-0.228998,1.78146,0.683778,0.586385,0.622441,0.631991,0.142037,...,-0.29573,-0.96468,-0.84323,0.162896,2.278612,-0.669038,0.873511,0.291438,-0.282391,5


Model Training

In [11]:
x = df.drop(columns=['odor'])
y = df['odor']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
x_train.shape, x_val.shape, x_test.shape

((6499, 22), (813, 22), (812, 22))

In [13]:
y_train.shape, y_val.shape, y_test.shape

((6499,), (813,), (812,))

In [14]:
model = DecisionTreeClassifier()
dt_model = model.fit(x_train, y_train)
dt_model

In [15]:
y_pred = dt_model.predict(x_val)
c_report = classification_report(y_val, y_pred)
print(c_report)

              precision    recall  f1-score   support

           0       0.09      0.13      0.11        38
           1       1.00      1.00      1.00        18
           2       0.66      0.78      0.71       219
           3       0.08      0.06      0.07        53
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00       351
           6       1.00      1.00      1.00        26
           7       0.00      0.00      0.00        44
           8       0.00      0.00      0.00        63

    accuracy                           0.71       813
   macro avg       0.54      0.55      0.54       813
weighted avg       0.67      0.71      0.69       813



In [16]:
param_dist = {
    'num_leaves': [15, 31, 63, 127],                   
    'max_depth': [-1, 5, 10, 20, 30],                   
    'learning_rate': [0.01, 0.05, 0.1, 0.2],            
    'n_estimators': [100, 200, 500],                    
    'min_child_samples': [10, 20, 50, 100],             
    'subsample': [0.6, 0.8, 1.0],                       
    'colsample_bytree': [0.6, 0.8, 1.0]                 
}


acc_scorer = make_scorer(accuracy_score, greater_is_better=True)
f1_scorer = make_scorer(f1_score, greater_is_better=True)

In [17]:
model = LGBMClassifier()

random_search = RandomizedSearchCV(
  estimator = model,
  param_distributions=param_dist,
  n_iter=20,
  scoring={'acc_score': acc_scorer, 'f1_score': f1_scorer},
  n_jobs=-1,
  random_state=42,
  cv=5,
  refit='f1_score'
)

In [18]:
lgbm = random_search.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 132
[LightGBM] [Info] Number of data points in the train set: 6499, number of used features: 22
[LightGBM] [Info] Start training from score -3.011083
[LightGBM] [Info] Start training from score -3.735978
[LightGBM] [Info] Start training from score -1.323527
[LightGBM] [Info] Start training from score -3.052556
[LightGBM] [Info] Start training from score -5.345416
[LightGBM] [Info] Start training from score -0.830312
[LightGBM] [Info] Start training from score -3.486099
[LightGBM] [Info] Start training from score -2.624546
[LightGBM] [Info] Start training from score -2.663511


In [19]:
y_pred = lgbm.predict(x_val)
c_report_lgbm = classification_report(y_val, y_pred)
print(c_report_lgbm)

              precision    recall  f1-score   support

           0       0.08      0.11      0.09        38
           1       1.00      1.00      1.00        18
           2       0.74      0.78      0.76       219
           3       0.13      0.09      0.11        53
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00       351
           6       1.00      1.00      1.00        26
           7       0.00      0.00      0.00        44
           8       0.00      0.00      0.00        63

    accuracy                           0.71       813
   macro avg       0.55      0.55      0.55       813
weighted avg       0.70      0.71      0.70       813



In [20]:
model = random_search
dump(model, 'mushroom.joblib')

['mushroom.joblib']

In [21]:
model

<function f1_score at 0x000002C2E7A65F80>


In [24]:
from tabulate import tabulate

In [30]:
headers=['model','f1_score','precision','accuracy_score']
data = [
  ['DecisionTreeClassifier', 54, 55, 71],
  ['LighGBM',55,55,71]
]

results = tabulate(data, headers=headers, tablefmt='fancy_grid')
print(results)

╒════════════════════════╤════════════╤═════════════╤══════════════════╕
│ model                  │   f1_score │   precision │   accuracy_score │
╞════════════════════════╪════════════╪═════════════╪══════════════════╡
│ DecisionTreeClassifier │         54 │          55 │               71 │
├────────────────────────┼────────────┼─────────────┼──────────────────┤
│ LighGBM                │         55 │          55 │               71 │
╘════════════════════════╧════════════╧═════════════╧══════════════════╛
