In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
df.shape

(8124, 23)

In [4]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
# !pip install pandas-profiling

In [7]:
#from pandas_profiling import ProfileReport
#prof = ProfileReport(df)
#prof.to_file(output_file= 'output_mushrooms.html')

In [8]:
df = df.drop(columns=['veil-type'])
df.sample(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6264,p,f,s,n,f,y,f,c,n,b,...,s,k,p,p,w,o,e,w,v,d
557,e,x,f,w,t,l,f,w,n,w,...,s,s,w,w,w,o,p,u,v,d
1618,e,x,s,n,f,n,f,w,b,h,...,s,s,w,w,w,o,e,n,s,g
4417,e,f,y,n,t,n,f,c,b,u,...,s,s,w,g,w,o,p,k,v,d
1374,p,x,y,n,t,p,f,c,n,p,...,s,s,w,w,w,o,p,k,s,g


In [9]:
df = pd.get_dummies(df,columns=['cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-color','ring-number','ring-type','spore-print-color','population','habitat'], drop_first=True)
df.sample(5)

Unnamed: 0,class,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
2349,e,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2233,e,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2816,e,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7442,p,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
5994,p,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [10]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [11]:
X

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
8121,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [12]:
y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [13]:
le = LabelEncoder()

y = le.fit_transform(y)

In [14]:
y

array([1, 0, 0, ..., 0, 1, 0])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=3)

In [16]:
print(X_train.shape)
print(X_test.shape)

(5686, 95)
(2438, 95)


In [17]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

1.0

In [18]:
confusion_matrix(y_test,y_pred)

array([[1279,    0],
       [   0, 1159]], dtype=int64)

# GridSearchCV

In [19]:
# Number of trees in random forest
n_estimators = [50,75,100,120]

# Number of features to consider at every split
max_features = [0.3,0.6,1.0]

# Maximum number of levels in tree
max_depth = [3,7,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

In [20]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }

In [21]:
rf = RandomForestClassifier()

In [22]:
rf_grid = GridSearchCV(estimator = rf, 
                       param_grid = param_grid, 
                       cv = 5, 
                       n_jobs = -1)

In [23]:
rf_grid.fit(X_train,y_train)

 0.98346804 0.98294126        nan        nan        nan        nan
 0.9864579  0.9864579  0.98645775 0.9864579  0.98505054 0.98645775
 0.98593004 0.9859302         nan        nan        nan        nan
 0.98557871 0.98628231 0.98557871 0.98593051 0.98575461 0.98610641
 0.98575461 0.9852269         nan        nan        nan        nan
 0.9998241  0.9996482  0.9998241  0.9996482  1.         1.
 1.         0.9998241         nan        nan        nan        nan
 0.9998241  0.9994723  0.9998241  0.9998241  1.         1.
 0.9998241  0.9998241         nan        nan        nan        nan
 0.9996482  0.9996482  0.9996482  0.9996482  0.9998241  0.9998241
 0.9998241  0.9996482         nan        nan        nan        nan
 1.         0.9998241  1.         1.         1.         1.
 1.         1.                nan        nan        nan        nan
 0.9998241  0.9996482  0.9998241  1.         1.         1.
 1.         1.                nan        nan        nan        nan
 0.9996482  0.9996482  0.999

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 7, None],
                         'max_features': [0.3, 0.6, 1.0],
                         'max_samples': [0.5, 0.75, 1.0],
                         'n_estimators': [50, 75, 100, 120]})

In [24]:
rf_grid.best_score_

1.0

In [25]:
rf_grid.best_params_

{'max_depth': 7, 'max_features': 0.3, 'max_samples': 0.75, 'n_estimators': 50}

In [26]:
model = RandomForestClassifier(max_depth= 7, max_features=0.3, max_samples= 0.75, n_estimators= 75)
model.fit(X_train,y_train)

RandomForestClassifier(max_depth=7, max_features=0.3, max_samples=0.75,
                       n_estimators=75)

In [27]:
y_pred = model.predict(X_test)
accuracy_score(y_pred, y_test)

1.0

In [28]:
confusion_matrix(y_pred, y_test)

array([[1279,    0],
       [   0, 1159]], dtype=int64)