In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,recall_score,f1_score,recall_score

%matplotlib inline

In [29]:
dataset = pd.read_csv('D:/14_My Practice/Datasets/mushrooms.csv')
dataset.head(6)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


In [30]:
dataset.describe(include='all').T

Unnamed: 0,count,unique,top,freq
class,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-surface,8124,4,y,3244
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728


In [31]:
dataset.isna().sum().sort_values(ascending=True)

class                       0
spore-print-color           0
ring-type                   0
ring-number                 0
veil-color                  0
veil-type                   0
stalk-color-below-ring      0
stalk-color-above-ring      0
stalk-surface-below-ring    0
stalk-surface-above-ring    0
population                  0
stalk-root                  0
gill-color                  0
gill-size                   0
gill-spacing                0
gill-attachment             0
odor                        0
bruises                     0
cap-color                   0
cap-surface                 0
cap-shape                   0
stalk-shape                 0
habitat                     0
dtype: int64

In [32]:
dataset['class'].value_counts()/len(dataset)*100

e    51.797144
p    48.202856
Name: class, dtype: float64

In [33]:
dataset['class'].value_counts(normalize=True)*100

e    51.797144
p    48.202856
Name: class, dtype: float64

In [34]:
#creating all categorical variables to binary varibles using dummy variables
dataset = pd.get_dummies(dataset,drop_first=True)
dataset.head()

Unnamed: 0,class_p,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [35]:
X = dataset.drop('class_p',axis=1)
y = dataset['class_p']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [40]:
model = DecisionTreeClassifier(max_depth=1,criterion='entropy')
Adaboost = AdaBoostClassifier(base_estimator=model,learning_rate=1,n_estimators=300)
boost_model = Adaboost.fit(X_train,y_train)
y_pred = boost_model.predict(X_test)

In [41]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1040
           1       1.00      1.00      1.00       991

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



In [42]:
print(accuracy_score(y_test,y_pred))

1.0


In [43]:
print(recall_score(y_test,y_pred))

1.0


In [44]:
print(confusion_matrix(y_test,y_pred))

[[1040    0]
 [   0  991]]


In [45]:
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

In [46]:
rf_model = RandomForestClassifier()
# choosing hyperparameters for grid search cv
params = {'n_estimators':[70,90,100,110,120],'criterion' : ["gini", "entropy"],'max_depth':[3,6,9,None],'max_features' : ["auto", "sqrt", "log2"]}
# choosing scoring metrics
scoring = ['accuracy','precision','recall','f1']

In [48]:
grid_model =GridSearchCV(estimator=rf_model,param_grid=params,scoring=scoring,cv=5,refit='recall')

In [49]:
grid_model.fit(X_train_scaled,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 6, 9, None],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [70, 90, 100, 110, 120]},
             refit='recall', scoring=['accuracy', 'precision', 'recall', 'f1'])

In [None]:
grid_model.best_score_

In [None]:
grid_model.best_params_