In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("mushrooms.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

# x & Y

In [4]:
input_data=df.drop(columns='class')
x=pd.get_dummies(input_data,drop_first=True)

In [5]:
y=df['class']

# Train test split


In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

# Gradient Boost classifier with default parameter

In [7]:
#modelling

from sklearn.ensemble import GradientBoostingClassifier
gb_model=GradientBoostingClassifier()
gb_model.fit(x_train,y_train)


#prediction 

train_predictions=gb_model.predict(x_train)
test_predictions=gb_model.predict(x_test)

#Evaluation

from sklearn.metrics import accuracy_score

train_r2=accuracy_score(y_train,train_predictions)
test_r2=accuracy_score(y_test,test_predictions)

print(train_r2,test_r2)

#cross val score
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(gb_model,x,y,cv=5).mean()
cvs

0.9996922603477458 0.9993846153846154


0.9192312239484653

# Identify best parameters for gradient boost model using Hyperparameter Tunning

In [8]:
from sklearn.model_selection import GridSearchCV

#model/estimator

estimator=GradientBoostingClassifier()

#parameter grid

param_grid={"n_estimators":[1,5,10,20,40,100],"learning_rate":[0.1,0.2,0.3,0.5,0.8,1]}

#grid search
grid=GridSearchCV(estimator,param_grid,cv=5,scoring='accuracy')

#fit the data
grid.fit(x_train,y_train)

#best parameters for GradientBoost model

print(grid.best_params_)

grid.best_estimator_

{'learning_rate': 0.5, 'n_estimators': 100}


In [9]:
grid.best_estimator_.feature_importances_

array([1.40010839e-04, 0.00000000e+00, 0.00000000e+00, 5.24199854e-12,
       0.00000000e+00, 1.33890724e-03, 1.37783204e-05, 5.58361460e-08,
       2.86768840e-07, 0.00000000e+00, 0.00000000e+00, 6.65847675e-12,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.77091347e-05,
       3.92928225e-03, 4.95024990e-02, 1.66756714e-04, 3.50987357e-03,
       1.85945247e-02, 1.32175241e-05, 6.29332713e-01, 1.09554933e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.72000183e-03,
       2.67932007e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 7.98039183e-17, 0.00000000e+00,
       4.80076480e-14, 5.08437905e-09, 1.36039465e-01, 0.00000000e+00,
       1.64836225e-03, 1.02135304e-03, 7.63720367e-05, 2.39071790e-05,
       9.13053823e-07, 1.23337783e-07, 8.10416774e-02, 2.07309561e-13,
       0.00000000e+00, 0.00000000e+00, 6.57690847e-04, 0.00000000e+00,
      

In [11]:
feats=pd.DataFrame(index=x.columns,
                  data=grid.best_estimator_.feature_importances_,
                  columns=['Importance'])
important_features=feats[feats['Importance']>0.01].index.to_list()
important_features

['bruises_t',
 'odor_l',
 'odor_n',
 'odor_p',
 'stalk-root_c',
 'stalk-surface-below-ring_y',
 'spore-print-color_h',
 'spore-print-color_r']

In [29]:
imp_features_list=feats[feats['Importance']>0.01]
imp_features_list

Unnamed: 0,Importance
bruises_t,0.049502
odor_l,0.018595
odor_n,0.629333
odor_p,0.010955
stalk-root_c,0.136039
stalk-surface-below-ring_y,0.081042
spore-print-color_h,0.016558
spore-print-color_r,0.028989


# GradientBoost With best hyperparameter

In [30]:
x=x[imp_features_list]  #dropped unimportant
y=df['class']


x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

#modelling
gb_hpt=GradientBoostingClassifier(n_estimators=40,learning_rate=0.5)
gb_hpt.fit(x_train,y_train)

#prediction

train_predictions=gb_hpt.predict(x_train)
test_predictions=gb_hpt.predict(x_test)

#Evaluation

from sklearn.metrics import accuracy_score
train_r2=accuracy_score(y_train,train_predictions)
test_r2=accuracy_score(y_test,test_predictions)

#cross validation Score

cvs=cross_val_score(gb_hpt,x,y,cv=5).mean()
cvs

ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Confusion Matrix


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,test_predictions)

# Classification Report

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,test_predictions))

In [None]:
plt.figure(figsize=(14,6),dpi=200)

sns.barplot(data=important_features.sort_values('Importance'),
           x=important_features.index,
           y='Importance')
plt.xticks(rotation=90)
plt.show()