# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from sklearn.metrics import classification_report,plot_confusion_matrix,accuracy_score
import cufflinks as cf
# Switch cufflinks to offline plotting. The function has to be *called*;
# the bare attribute reference that was here before did nothing.
cf.go_offline()

# Mushroom Hunting: Edible or Poisonous?

Attribute Information:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

In [None]:
# Load the mushroom dataset from the given CSV path into a DataFrame.
df = pd.read_csv('../input/mushroom-classification/mushrooms.csv')

In [None]:
# Preview the first few rows of the dataset.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of df.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

# Plots

In [None]:
# Bar chart of how many rows fall into each value of the 'class' column.
plt.figure(dpi=125)
sns.countplot(x='class', data=df)

### Plot of number of unique categories in different columns

In [None]:
# One bar per column of df, ordered by ascending number of unique categories.
# reset_index() turns the column names into a regular 'index' column so they
# can be used as the x variable.
plt.figure(figsize=(12, 8), dpi=150)
sns.barplot(
    x='index',
    y='unique',
    data=df.describe().T.reset_index().sort_values('unique'),
)
plt.xticks(rotation=90);

# Train Test split 

In [None]:
# Feature matrix: every column of df except the 'class' label.
x = df.drop(columns='class')

In [None]:
# Target vector: the 'class' column of df.
y = df['class']

### Get dummy variables for categorical data so we can feed it into the model

In [None]:
# One-hot encode the features; drop_first=True drops the first level of each
# encoded column, leaving k-1 indicator columns for a k-level category.
x = pd.get_dummies(x,drop_first=True)

In [None]:
# Inspect the columns and dtypes of the encoded feature matrix.
x.info()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

# AdaBoost model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Number of feature columns after one-hot encoding.
x.shape[1]

### As there are 95 columns, try every value of the n_estimators hyperparameter in range(1, 96)
#### and pick the one that gives the least error; keep in mind that after reaching a threshold there won't be much decrease in the error
### For loop to check different n_estimators values, appending each error to the list

In [None]:
# Sweep n_estimators from 1 to 95: train a fresh AdaBoost model for each value
# and record its misclassification rate (1 - accuracy) on the test split.
# NOTE(review): the error is measured on x_test / y_test, so picking
# n_estimators from this curve tunes the hyperparameter on the test data.
error_list = []

for i in range(1, 96):
    # fit() returns the estimator itself, so construction and training chain.
    model = AdaBoostClassifier(n_estimators=i).fit(x_train, y_train)
    predictions = model.predict(x_test)

    error = 1 - accuracy_score(y_test, predictions)
    error_list.append(error)

    

In [None]:
import plotly.express as px

# Plot the recorded error against n_estimators. The x axis is derived from the
# number of recorded errors (the sweep starts at n_estimators=1) instead of a
# hard-coded range, so the two sequences always have the same length.
fig = px.line(
    x=range(1, len(error_list) + 1),
    y=error_list,
    labels=dict(x="n_estimators", y="error"),
)
fig.show()

### We can see that the error reaches its minimum at 15 and stays there, so I'm choosing n_estimators=15

In [None]:
# AdaBoost classifier with 15 estimators.
new_model = AdaBoostClassifier(n_estimators=15)

In [None]:
# Train on the training split only.
new_model.fit(x_train,y_train)

In [None]:
# Predicted class labels for the held-out test rows.
predictions = new_model.predict(x_test)

In [None]:
# Importance scores of the fitted model, one value per feature column.
new_model.feature_importances_

### The model is performing absolutely great

In [None]:
# Per-class precision / recall / F1 of the test-set predictions.
print(classification_report(y_test,predictions))

In [None]:

# Plot the confusion matrix on the held-out split so it agrees with the
# classification report, which is computed from x_test / y_test. Plotting on
# the training rows only shows how well the model memorised them.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions ConfusionMatrixDisplay.from_estimator is the replacement.
plot_confusion_matrix(new_model,x_test,y_test)

In [None]:
# Single-column table of importances, labelled by encoded feature name.
features = pd.DataFrame(
    data=new_model.feature_importances_,
    index=x.columns,
    columns=['Importance'],
)

### Features which the model considered important

In [None]:
# Keep only the features with a strictly positive importance.
good_features = features[features['Importance'].gt(0)]

In [None]:
# Display the features with non-zero importance.
good_features

In [None]:
# Bar chart of the non-zero feature importances in ascending order.
# The x labels must come from the SAME sorted frame as the heights: pairing the
# sorted 'Importance' values with the unsorted good_features.index matched them
# up by position and put the wrong feature name under each bar.
plt.figure(figsize=(14,6),dpi=200)
sorted_features = good_features.sort_values('Importance')
sns.barplot(data=sorted_features,x=sorted_features.index,y='Importance')
plt.xticks(rotation=90);

# Now we can create a final model using all the data

In [None]:
# Fresh AdaBoost classifier with 15 estimators.
final_model = AdaBoostClassifier(n_estimators=15)

In [None]:
# Fit on the complete encoded dataset (x, y) rather than only the training split.
final_model.fit(x,y)