In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing its full path.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Edible and Poisonous Mushrooms Dataset

* The mushroom dataset has 22 features in total, and all of them are categorical.

### Independent Features are:
* 1. Cap-Shape- bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
* 2. Cap-surface- fibrous=f,grooves=g,scaly=y,smooth=s
* 3. Cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
* 4. Bruises: bruises=t,no=f
* 5. Odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
* 6. Gill-attachment: attached=a,descending=d,free=f,notched=n
* 7. Gill-spacing:close=c,crowded=w,distant=d
* 8. Gill-size:broad=b,narrow=n
* 9. Gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
* 10. Stalk-shape: enlarging=e,tapering=t
* 11.Stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
* 12.Stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s


* 13.Stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
* 14.Stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* 15.Stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* 16.Veil-type: partial=p,universal=u
* 17. Veil-color:brown=n,orange=o,white=w,yellow=y
* 18.Ring-number: none=n,one=o,two=t
* 19.Ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
* 20.Spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
* 21.Population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
* 22.Habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

### Dependent Feature is:
* 1. class:poisonous=p, edible=e

# Things to do
* 1. Uploading dataset
* 2. Exploratory Data Analysis
* 3. Hyperparameter optimization
*  a. RandomForestClassifier
*  b. XGBClassifier
* 4. Model training and evaluation using:
*  a. RandomForestClassifier
*  b. XGBClassifier      

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV



%matplotlib inline

# Uploading Dataset

In [None]:
# Load the mushroom-classification CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/mushroom-classification/mushrooms.csv")

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Bar chart of how many mushrooms fall in each target class.
plt.figure(figsize=(5,5))
# seaborn >= 0.12 only accepts `data` positionally; the column to count must be
# passed by keyword as x=, otherwise the Series is interpreted as `data`.
sns.countplot(x='class', data=df)
plt.title("Distribution of edible and poisonous mushrooms\n p=poisonous, e=edible")
plt.show()

In [None]:
# Count the rows belonging to each value of the target column.
df['class'].value_counts()

## No null values found (note: `stalk-root` marks missing entries with `?`, which is not counted as null)
## The dataset is not imbalanced

In [None]:
# List the column names of the frame.
df.columns

In [None]:
# Exploratory Data Analysis

In [None]:
# Cap, bruise and odor columns drawn by the cap-feature count plots.
cat_feature = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
]

In [None]:
# One count plot per column in `cat_feature`, split by class, on a 3x2 grid.
fig, ax=plt.subplots(3, 2, figsize=(25,15))
for col_name, panel in zip(cat_feature, ax.flat):
    # seaborn >= 0.12 only accepts `data` positionally, so the column name must
    # be passed as x=; a positional name collides with data= and raises TypeError.
    sns.countplot(x=col_name, hue='class', data=df, ax=panel)
    panel.set_title(col_name, fontsize=14)

# The grid has more panels than features; hide the ones left empty.
for unused_panel in ax.flat[len(cat_feature):]:
    unused_panel.set_visible(False)

fig.suptitle("Disribution of Cap-Features in non-poisnous and poisnous mushrooms", fontsize=20)
plt.show()

In [None]:
# One count plot per gill column, split by class, laid out side by side.
gill_features=['gill-attachment', 'gill-spacing', 'gill-size', 'gill-color']

fig,ax=plt.subplots(1,4,figsize=(20,5))
for col_name, panel in zip(gill_features, ax):
    # seaborn >= 0.12 only accepts `data` positionally, so the column name must
    # be passed as x=; a positional name collides with data= and raises TypeError.
    sns.countplot(x=col_name, hue='class', data=df, ax=panel)
    panel.set_title(col_name)
fig.suptitle("Distribution of gill features in poisnous and non-poisnous mushrooms", fontsize=20)

plt.show()

In [None]:
# Stalk, veil, ring and spore-print columns drawn by the 4x3 count-plot grid.
features = [
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
]

In [None]:
# One count plot per column in `features`, split by class, on a 4x3 grid.
fig, ax=plt.subplots(4,3, figsize=(20,25))
for col_name, panel in zip(features, ax.flat):
    # seaborn >= 0.12 only accepts `data` positionally, so the column name must
    # be passed as x=; a positional name collides with data= and raises TypeError.
    sns.countplot(x=col_name, hue='class', data=df, ax=panel)
    panel.set_title(col_name, fontsize=14)

# The grid has more panels than features; hide the ones left empty.
for unused_panel in ax.flat[len(features):]:
    unused_panel.set_visible(False)
plt.suptitle("Distribution of features in poisnous and non-poisnous features", fontsize=14)
plt.show()

In [None]:
# Class-wise count plots for the population and habitat columns, side by side.
fig, ax=plt.subplots(1,2, figsize=(20,8))

for col_name, panel in zip(['population', 'habitat'], ax):
    # seaborn >= 0.12 only accepts `data` positionally, so the column name must
    # be passed as x=; a positional name collides with data= and raises TypeError.
    sns.countplot(x=col_name, hue='class', data=df, ax=panel)
    panel.set_title(col_name)

plt.suptitle("Role of population and habitat in mushrooms", fontsize=14)
plt.show()

In [None]:
# Replace every column of `df` (features and target alike) with integer labels,
# fitting a separate LabelEncoder on each column. This overwrites `df` in place.
columns = df.columns.values
for column in columns:
    le = LabelEncoder()
    encoded_values = le.fit(df[column]).transform(df[column])
    df[column] = encoded_values
df.head()

In [None]:
# Separate the 'class' target from the remaining predictor columns.
y = df['class']
X = df.drop(columns=['class'])

In [None]:
# Grid search over min_samples_leaf for a small random forest, scored by ROC AUC.
parameters={'min_samples_leaf':[20,25]}
# Fixed seed so the fitted forest (and its feature importances) are reproducible.
forest=RandomForestClassifier(max_depth=15, n_estimators=15, random_state=0)
# The built-in 'roc_auc' scorer ranks by predicted scores/probabilities, whereas
# make_scorer(roc_auc_score) would feed hard predict() labels into the AUC.
grid=GridSearchCV(forest, parameters, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

In [None]:
# Run the grid search on the full feature matrix and target (no hold-out split is used here).
grid.fit(X,y)

In [None]:
# Show the estimator selected by the grid search.
grid.best_estimator_


In [None]:
# Show the cross-validated score achieved by the selected parameter setting.
grid.best_score_

In [None]:
# Show the parameter setting selected by the grid search.
grid.best_params_

In [None]:
# Number of highest-importance features kept when ranking feature importances.
top_features=10

In [None]:
# Names of the predictor columns, in the same order as the importance vector.
feature_name = df.drop(columns='class').columns.values
# Importances reported by the forest that the grid search selected.
imp_feature = grid.best_estimator_.feature_importances_
# Indices of the `top_features` largest importances, most important first.
descending_order = np.argsort(imp_feature)[::-1]
idx = descending_order[:top_features]

In [None]:
# Bar chart of the selected top features against their importance scores.
plt.figure(figsize=(25,5))
top_names = feature_name[idx]
top_scores = imp_feature[idx]
importance_axes = sns.barplot(x=top_names, y=top_scores)
importance_axes.set_title("Top ten important features that diffrentiate between poisnous and non-poisnous mushrooms")
plt.show()

# Hyperparameter optimization

* RandomForestClassifier
* XGBClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# 1. RandomForestClassifier


In [None]:
# Randomized hyperparameter search for the random forest, scored by accuracy.
rf=RandomForestClassifier(random_state=0)
rf_param={"n_estimators": list(range(100,500)),
         "criterion":['gini','entropy'],
         "max_depth": list(range(1,10)),
         # An integer max_samples is an absolute row count per tree, so 1..9 would
         # fit every tree on at most 9 rows; search over fractions of the data instead.
         "max_samples":[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
# Seed the sampler so the same 10 candidate settings are drawn on every run.
random_search=RandomizedSearchCV(rf, rf_param, n_jobs=-1, n_iter=10, scoring='accuracy', verbose=2, random_state=0)

In [None]:
# Tune on the training split only, so the held-out test rows play no part in
# hyperparameter selection and the final test accuracy stays unbiased.
random_search.fit(X_train,y_train)

In [None]:
# Print the estimator, parameters, score and candidate index chosen by the search.
rf_search_report = (
    ("Random Forest Classifier Best estimator is :", random_search.best_estimator_),
    ("Random Forest Classifier Best parameter is :", random_search.best_params_),
    ("Random Forest Classifier Best score is :", random_search.best_score_),
    ("Random Forest Classifier Best index is :", random_search.best_index_),
)
for caption, result in rf_search_report:
    print(caption, result)

In [None]:
# Train the final random forest with the hyperparameters the randomized search
# just selected, instead of values copied from an earlier run (which also pinned
# max_samples to 3 rows per tree). Fit on the training split only.
rf=RandomForestClassifier(**random_search.best_params_, random_state=0)
rf.fit(X_train,y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Accuracy of the random forest on the held-out test split.
rf_test_pred = rf.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order.
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)
print("Accuracy score for Random Forest Classifier is:",round(rf_test_accuracy*100,2),'%')

# 2.XGBClassifier

In [None]:
# Candidate values sampled by the randomized search for the XGBoost classifier.
xgb_param = dict(
    n_estimators=list(range(100, 500)),
    max_depth=list(range(1, 10)),
    learning_rate=[
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.05, 0.09,
    ],
    min_child_weight=list(range(1, 10)),
)

In [None]:
# Randomized hyperparameter search for the XGBoost classifier (5-fold CV).
xgb=XGBClassifier()
# Seed the sampler so the same 10 candidate settings are drawn on every run.
random_search=RandomizedSearchCV(xgb, xgb_param, n_iter=10, n_jobs=-1, cv=5, verbose=2, random_state=0)
# Tune on the training split only, so the held-out test rows play no part in
# hyperparameter selection.
random_search.fit(X_train,y_train)

In [None]:
# Print the estimator, parameters, score and candidate index chosen by the search.
xgb_search_report = (
    ("XGBClassifier best estimator is :", random_search.best_estimator_),
    ("XGBClassifier best parameters is :", random_search.best_params_),
    ("XGBClassifier best score is :", random_search.best_score_),
    ("XGBClassifer best index is", random_search.best_index_),
)
for caption, result in xgb_search_report:
    print(caption, result)

In [None]:
# Train the final XGBoost model with the hyperparameters the randomized search
# just selected, instead of values copied from an earlier run. Fit on the
# training split only.
xgb=XGBClassifier(**random_search.best_params_)
xgb.fit(X_train,y_train)

In [None]:
# Accuracy of the XGBoost model on the held-out test split.
xgb_test_pred = xgb.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order.
xgb_test_accuracy = accuracy_score(y_test, xgb_test_pred)
print("Accuracy score for XGB Classifier is:", round(xgb_test_accuracy*100,2),"%")