# Mushroom Classification

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve

### Attribute Information: 

classes: edible=e, poisonous=p

cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

## Data Analysis

In [None]:
# Load the mushroom dataset from the CSV file into a DataFrame.
data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
# Preview the first rows to check the columns and their raw letter codes.
data.head()

In [None]:
# Show the dataset dimensions as (number of rows, number of columns).
data.shape

In [None]:
# Plot how many samples fall into each value of the 'class' column
# (edible=e, poisonous=p) to check the class balance.
ax = sns.countplot(x='class', data=data, palette = "rocket")

In [None]:
# Draw one count plot per column 1..22 (column 0 is skipped -- presumably the
# 'class' label itself), with bars split by class, stacked vertically.
figs, axes = plt.subplots(nrows=22, ncols=1, figsize=(15, 150))
for col_idx, axis in enumerate(axes, start=1):
    feature_values = data.iloc[:, col_idx]
    ax = sns.countplot(data=data, x=feature_values, hue='class', palette='rocket', ax=axis)

## Data Preprocessing

In [None]:
# Print a summary of the columns: dtypes and non-null counts.
data.info()

Converting the object-typed (categorical) columns to integer codes

In [None]:
# Replace the values of every column (including 'class') with integer codes.
# A single encoder is refit for each column, so after the loop it only
# remembers the mapping of the last column.
le = preprocessing.LabelEncoder()
for column_name in data.columns:
    raw_values = data[column_name].values
    data[column_name] = le.fit_transform(raw_values)

In [None]:
# Preview the first rows again to inspect the current contents of the frame.
data.head()

In [None]:
# Correlate every column with the 'class' column, drop the trivial
# class-vs-class entry, and show the rest from highest to lowest.
class_corr = data.corrwith(data['class'])
feature_corr = class_corr.drop('class')
feature_corr.sort_values(ascending=False)

In [None]:
# Count how often each distinct value of 'veil-type' occurs.
data['veil-type'].value_counts()

`veil-type` takes only a single value, so it carries no information for classification; we drop that column.

In [None]:
# Remove the 'veil-type' column from the frame in place.
data.drop(columns=['veil-type'], inplace=True)

## Modeling

In [None]:
# Separate the feature matrix (every column except 'class') from the target.
X = data.drop(columns=['class'])
y = data['class']

In [None]:
# Score each classifier by its mean accuracy over 10-fold stratified
# cross-validation.
#
# The classifiers are held in a LIST, not a set: a set has no defined
# iteration order, so the scores could come out in a different order than the
# model names they are later plotted against. The order here matches the
# `models` name list used for the accuracy bar plot.
classifiers = [
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    XGBClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
]

# One seeded splitter shared by all models: every classifier is evaluated on
# the same folds, and re-running the cell reproduces the same splits.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=88)

score_list = [
    cross_val_score(classifier, X, y, cv=cv).mean()
    for classifier in classifiers
]

In [None]:
# Display names for the evaluated classifiers. They must be listed in the same
# order as the entries of score_list for the plot labels to be correct.
models = [
    'RandomForestClassifier',
    'LogisticRegression',
    'KNeighborsClassifier',
    'XGBClassifier',
    'GaussianNB',
    'DecisionTreeClassifier',
]

In [None]:
# Bar chart of the mean cross-validation accuracy per model.
figs, axes = plt.subplots(figsize=(12, 5))
bars = sns.barplot(x=models, y=score_list, palette="rocket")
# NOTE: set_title returns the title Text object, so that is what `ax` holds.
ax = bars.set_title('Accuracy Scores')

In [None]:
# Tabulate the mean cross-validation accuracy of each model, one row per model.
# The frame is built from a dict of columns; passing the two lists through
# `columns=` would instead create an EMPTY frame whose header is a MultiIndex
# made of the names and scores.
accuracy_df = pd.DataFrame({'Model': models, 'Accuracy': score_list})
accuracy_df

### ROC Curve for RandomForestClassifier

In [None]:
# Hold out 30% of the samples for testing (shuffled, fixed seed), then fit a
# random forest with default settings on the remaining 70%.
split = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=88)
X_train, X_test, y_train, y_test = split
Rf = RandomForestClassifier()
Rf.fit(X_train, y_train)

In [None]:
# Predicted probability of the class in column 1 of predict_proba
# (presumably 'poisonous', encoded as 1 -- verify against the label encoding),
# used as the score for the ROC curve.
y_prob = Rf.predict_proba(X_test)[:, 1]
# False-positive rates, true-positive rates and the matching thresholds.
FPR, TPR, Threshold = roc_curve(y_test, y_prob)

In [None]:
# Draw the ROC curve: true-positive rate against false-positive rate.
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')