In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import Image
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [None]:
Image(url = 'https://natureecoevocommunity.nature.com/cdn-cgi/image/quality=90/https://images.zapnito.com/uploads/ebb294a8195aad356c35dcc9320b2e8d/bc84ba5c-7f0c-44fc-a79d-1ee934e1875a.jpeg', width=600)

Mushroom hunting, mushrooming, mushroom picking, mushroom foraging, and similar terms describe the activity of gathering mushrooms in the wild, typically for culinary use. This practice is popular throughout most of Europe, Australia, Japan, Korea, parts of the Middle East, and the Indian subcontinent, as well as the temperate regions of Canada and the United States.


In [None]:
Image(url = 'https://www.anatomynote.com/wp-content/uploads/2018/09/3937/Mushroom-anatomical-structure.gif',width=800,height=400)


- **classes**: edible=e, poisonous=p
- **cap-shape**: bell=b,conical=c, convex=x, flat=f, knobbed=k,sunken=s
- **cap-surface**: fibrous=f, grooves=g, scaly=y, smooth=s
- **cap-color**:  brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
- **bruises**: bruises=t, no=f
- **odor**: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
- **gill-attachment**: attached=a, descending=d, free=f, notched=n
- **gill-spacing**: close=c, crowded=w, distant=d
- **gill-size**: broad=b, narrow=n
- **gill-color**: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
- **stalk-shape**: enlarging=e, tapering=t
- **stalk-root**: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
- **stalk-surface-above-ring**: fibrous=f, scaly=y, silky=k, smooth=s
- **stalk-surface-below-ring**: fibrous=f, scaly=y, silky=k, smooth=s
- **stalk-color-above-ring**: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
- **stalk-color-below-ring**: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
- **veil-type**: partial=p, universal=u
- **veil-color**: brown=n, orange=o, white=w, yellow=y
- **ring-number**: none=n, one=o, two=t
- **ring-type**: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
- **spore-print-color**: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
- **population**: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
- **habitat**: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

In [None]:
# Read the dataset 
mushroom_df = pd.read_csv('../input/mushroom-classification/mushrooms.csv')

In [None]:
print('\033[1m', 'First five rows of the dataset.', '\033[0m')
mushroom_df.head()

In [None]:
mushroom_df.info()

In [None]:
print('\033[1m', 'Statistical summary of all the columns', '\033[0m')
mushroom_df.describe()

In [None]:
# Dealing with missing values in stalk-root column
# '?' is the dataset's placeholder for a missing stalk root. The results are
# assigned back to the column instead of using inplace=True on
# mushroom_df['stalk-root']: an inplace call on a selected column is chained
# assignment, which pandas with Copy-on-Write does not write back to the frame.
mushroom_df['stalk-root'] = mushroom_df['stalk-root'].replace('?', np.nan)
print('\033[1m', 'Value counts for stalk root before filling missing values: ', '\033[0m', mushroom_df['stalk-root'].value_counts(), sep='\n')
# '\033[0m' switches bold off again so the count itself is printed in normal weight.
print('\033[1m', '\nNumber of null values: ', '\033[0m', mushroom_df['stalk-root'].isnull().sum())
# Impute the missing entries with the most frequent stalk-root category.
mushroom_df['stalk-root'] = mushroom_df['stalk-root'].fillna(mushroom_df['stalk-root'].mode()[0])
print('\033[1m', 'Value counts for stalk root after filling missing values with mode: ', '\033[0m', mushroom_df['stalk-root'].value_counts(), sep='\n')

In [None]:
# Lookup table: for every column, the single-letter code -> readable category name
labels = {
    'class': {'e': 'edible', 'p': 'poisonous'},
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 'k': 'knobbed', 's': 'sunken'},
    'cap-surface': {'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'},
    'cap-color': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    'bruises': {'t': 'bruises', 'f': 'no'},
    'odor': {'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty', 'n': 'none', 'p': 'pungent', 's': 'spicy'},
    'gill-attachment': {'a': 'attached', 'f': 'free', 'd': 'descending', 'n': 'notched'},
    'gill-spacing': {'c': 'close', 'w': 'crowded', 'd': 'distant'},
    'gill-size': {'b': 'broad', 'n': 'narrow'},
    'gill-color': {'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray', 'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    'stalk-shape': {'e': 'enlarging', 't': 'tapering'},
    'stalk-root': {'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 'r': 'rooted'},
    'stalk-surface-above-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    'stalk-surface-below-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    'stalk-color-above-ring': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    'stalk-color-below-ring': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    'veil-type': {'p': 'partial', 'u': 'universal'},
    'veil-color': {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
    'ring-number': {'n': 'none', 'o': 'one', 't': 'two'},
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large', 'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone'},
    'spore-print-color': {'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green', 'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow'},
    'population': {'a': 'abundant', 'c': 'clustered', 'n': 'numerous', 's': 'scattered', 'v': 'several', 'y': 'solitary'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
}

# Readable copy of the data used for plotting; `mushroom_df` itself is not modified
view_df = mushroom_df.replace(labels)

In [None]:
# Defining a function to plot countplots of various columns from the dataset
def plot_countplots(col, data=None):
    """Draw two count plots for one categorical column, side by side.

    The left panel counts the rows per category of ``col``; the right panel
    shows the same counts split by the ``class`` column. Both panels order the
    categories by descending frequency.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame containing ``col`` and a ``class`` column. When omitted, the
        notebook-level ``view_df`` is used.
    """
    if data is None:
        data = view_df

    # Select the style before any axes are created so that both panels of the
    # figure are drawn with it (it was previously set between the two panels).
    sns.set_style('whitegrid')

    # Shared by both panels: category order and the human-readable column name.
    order = data[col].value_counts().index
    readable_name = col.replace('-', ' ')
    x_label = '{} of the Mushroom'.format(readable_name.capitalize())

    # Positions 221/222 of a 2x2 grid: the two panels fill the top half of the figure.
    fig = plt.figure(figsize=(22, 10))

    ax_total = fig.add_subplot(221)
    sns.countplot(x=col, data=data, order=order, palette='summer', ax=ax_total)
    ax_total.set_xlabel(x_label, fontsize=16)
    ax_total.set_ylabel('')
    ax_total.set_title('Distribution of mushroom by {}'.format(readable_name), fontsize=22, pad=10)

    ax_by_class = fig.add_subplot(222)
    sns.countplot(x=col, data=data, hue='class', order=order, palette='summer', ax=ax_by_class)
    ax_by_class.set_xlabel(x_label, fontsize=16)
    ax_by_class.set_ylabel('')
    ax_by_class.set_title('Distribution of mushroom by {} and class'.format(readable_name), fontsize=22, pad=10)
    # Anchor the legend past the right edge so it does not cover the bars.
    ax_by_class.legend(bbox_to_anchor=(1.15, 1), loc='upper right', fontsize=12)

    sns.despine(fig=fig, left=True)
    plt.show()

### Cap Shape

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic15',width=400,height=400)

In [None]:
plot_countplots('cap-shape')

- Convex and flat caps make up the majority of cap shapes.
- Bell-shaped caps appear more likely to be edible than other cap shapes.

### Cap surface

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic16' ,width=800,height=400)

In this dataset:

The scaly cap surface category includes both raised scales and flat scales.

Grooves = Patches

In [None]:
plot_countplots('cap-surface')

### Cap color

In [None]:
plot_countplots('cap-color')

### Bruises

In [None]:
plot_countplots('bruises')

- Mushrooms that bruise are more often edible than mushrooms that do not bruise.

### Odor

In [None]:
plot_countplots('odor')

- According to the graph, odorless (none) and foul-smelling mushrooms are the most common.
- According to the graph, odor will play an important role in distinguishing the class of a mushroom.

### Gill attachment

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic19' ,width=800,height=400)

In [None]:
plot_countplots('gill-attachment')

- The gill attachment feature is heavily imbalanced.
- Mushrooms with free gill attachment are split almost evenly between edible and poisonous.

### Gill spacing

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic17')

In [None]:
plot_countplots('gill-spacing')

- The gill spacing feature is imbalanced.
- Mushrooms with crowded gill spacing are more often edible than those with close gill spacing.

### Gill size

In [None]:
plot_countplots('gill-size')

- Mushrooms with narrow gills are more often poisonous than those with broad gills.

### Gill color

In [None]:
plot_countplots('gill-color')

- If the gill color is buff, do not eat that mushroom.

### Stalk shape

In [None]:
plot_countplots('stalk-shape')

### Stalk root

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic21' ,width=800,height=400)

In [None]:
plot_countplots('stalk-root')

### Stalk surface below ring

In [None]:
plot_countplots('stalk-surface-below-ring')

- A silky stalk surface below the ring looks more poisonous than the other stalk surface types.

### Stalk surface above ring

In [None]:
plot_countplots('stalk-surface-above-ring')

- A silky stalk surface above the ring looks more poisonous than the other stalk surface types.

### Stalk color below ring

In [None]:
plot_countplots('stalk-color-below-ring')

- Mushrooms whose stalk color below the ring is pink, brown, or buff seem more likely to be poisonous.

### Stalk color above ring

In [None]:
plot_countplots('stalk-color-above-ring')

- Mushrooms whose stalk color above the ring is pink, brown, or buff seem more likely to be poisonous.

### Veil type

In [None]:
plot_countplots('veil-type')

### Veil color

In [None]:
plot_countplots('veil-color')

- The veil color feature is heavily imbalanced.

### Ring number

In [None]:
plot_countplots('ring-number')

### Ring type

In [None]:
Image(url = 'https://www.usask.ca/biology/fungi/graphics/glossary_pictures/glossary_pic20' ,width=800,height=400)

In [None]:
plot_countplots('ring-type')

- Mushrooms with a large ring type seem dangerous.

### Spore print color

In [None]:
plot_countplots('spore-print-color')

- White and chocolate spore prints look more poisonous than other colors.

### Population

In [None]:
plot_countplots('population')

### Habitat

In [None]:
plot_countplots('habitat')

# Preprocessing

**Label Encoder**
- Most scikit-learn functions expect numeric labels rather than text labels, so the text labels need to be converted to numbers. This process is called label encoding.

In [None]:
# A single encoder is refit on each column in turn, replacing that column's
# category names with integer codes; afterwards `le` holds only the last fit.
le = LabelEncoder()
for col in view_df.columns:
    view_df[col] = le.fit_transform(view_df[col])

In [None]:
print('\033[1m\033[4mDataset after encoding with Label Encoder\033[0m')
view_df

In [None]:
# Separate the target (`class`) from the remaining feature columns
y = view_df['class']
X = view_df.drop(columns='class')
X.head()

**OneHotEncoder**

- Encode categorical integer features using a one-hot aka one-of-K scheme.
- The input to this transformer should be a matrix of integers, denoting the values taken on by categorical (discrete) features.
- The output will be a sparse matrix where each column corresponds to one possible value of one feature.
- It is assumed that input features take on values in the range [0, n_values).
- This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels.

In [None]:
# Encode the label encoded dataset
# The encoder is fitted on the feature columns of `view_df` rather than on `X`:
# this cell rebinds `X` to the encoded array, so reading `X` as the input would
# one-hot encode the already encoded matrix if the cell were run a second time.
one_hot = OneHotEncoder()
X = one_hot.fit_transform(view_df.drop('class', axis=1)).toarray()

In [None]:
# Return feature names for output features.
# `get_feature_names()` was removed from scikit-learn's encoders in version 1.2;
# `get_feature_names_out()` replaces it. The old method is kept as a fallback
# for installations that predate the new one.
print('\033[1m\033[4m', 'Feature names after one hot encoding', '\033[0m')
one_hot.get_feature_names_out() if hasattr(one_hot, 'get_feature_names_out') else one_hot.get_feature_names()

## Splitting the data into training and test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Report the shape of each of the four objects returned by the train/test split.
# The last label now says "target": the value printed next to it is y_test.shape.
print('\033[1m\033[4m Shape of training set features:\033[0m', X_train.shape,
      '\033[1m\033[4m Shape of training set target:\033[0m', y_train.shape,
      '\033[1m\033[4m Shape of test set features:\033[0m', X_test.shape,
      '\033[1m\033[4m Shape of test set target:\033[0m', y_test.shape,
      sep='\n')

# Model comparison using cross validation

## Logistic Regression

In [None]:
# Fit on the training split; this fitted model is evaluated on the test split below.
clf = LogisticRegression().fit(X_train, y_train)
# 10-fold cross-validated accuracy, computed on the training split only.
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
# ROC/AUC are computed from the predicted probability of the second class in
# clf.classes_ rather than from the hard 0/1 predictions, which would reduce
# the curve to a single operating point.
y_score = clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('\033[1m','Cross val accuracy: ', '\033[0m', accuracy, '\033[1m', '\n\n', 'Mean of cross val accuracies: ', '\033[0m', np.mean(accuracy))
plt.figure(figsize=(15, 15))
plt.subplot(221)
# fmt='d' prints the confusion-matrix counts as plain integers.
sns.heatmap(confusion, cmap='summer', annot=True, fmt='d');
plt.subplot(222)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, color='green', label='AUC= %0.2f'%roc_auc);
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],linestyle='--');
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

## GaussianNB

In [None]:
# Fit on the training split; this fitted model is evaluated on the test split below.
clf = GaussianNB().fit(X_train, y_train)
# 10-fold cross-validated accuracy, computed on the training split only.
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
# ROC/AUC are computed from the predicted probability of the second class in
# clf.classes_ rather than from the hard 0/1 predictions, which would reduce
# the curve to a single operating point.
y_score = clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('\033[1m','Cross val accuracy: ', '\033[0m', accuracy, '\033[1m', '\n\n', 'Mean of cross val accuracies: ', '\033[0m', np.mean(accuracy))
plt.figure(figsize=(15, 15))
plt.subplot(221)
# fmt='d' prints the confusion-matrix counts as plain integers.
sns.heatmap(confusion, cmap='summer', annot=True, fmt='d');
plt.subplot(222)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, color='green', label='AUC= %0.2f'%roc_auc);
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],linestyle='--');
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

## Decision Tree Classifier

In [None]:
# Fit on the training split; this fitted model is evaluated on the test split below.
# random_state is fixed (same seed as the train/test split) so reruns give the same tree.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
# 10-fold cross-validated accuracy, computed on the training split only.
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
# ROC/AUC are computed from the predicted probability of the second class in
# clf.classes_ rather than from the hard 0/1 predictions, which would reduce
# the curve to a single operating point.
y_score = clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('\033[1m','Cross val accuracy: ', '\033[0m', accuracy, '\033[1m', '\n\n', 'Mean of cross val accuracies: ', '\033[0m', np.mean(accuracy))
plt.figure(figsize=(15, 15))
plt.subplot(221)
# fmt='d' prints the confusion-matrix counts as plain integers.
sns.heatmap(confusion, cmap='summer', annot=True, fmt='d');
plt.subplot(222)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, color='green', label='AUC= %0.2f'%roc_auc);
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],linestyle='--');
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

## Random Forest Classifier

In [None]:
# Fit on the training split; this fitted model is evaluated on the test split below.
# random_state is fixed (same seed as the train/test split) so reruns give the same forest.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# 10-fold cross-validated accuracy, computed on the training split only.
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
# ROC/AUC are computed from the predicted probability of the second class in
# clf.classes_ rather than from the hard 0/1 predictions, which would reduce
# the curve to a single operating point.
y_score = clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('\033[1m','Cross val accuracy: ', '\033[0m', accuracy, '\033[1m', '\n\n', 'Mean of cross val accuracies: ', '\033[0m', np.mean(accuracy))
plt.figure(figsize=(15, 15))
plt.subplot(221)
# fmt='d' prints the confusion-matrix counts as plain integers.
sns.heatmap(confusion, cmap='summer', annot=True, fmt='d');
plt.subplot(222)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, color='green', label='AUC= %0.2f'%roc_auc);
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],linestyle='--');
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

The random forest classifier gave the highest accuracy, 1.0. The decision tree classifier reached an accuracy of 0.99982 and logistic regression 0.99964. Gaussian Naive Bayes gave the lowest accuracy of the four models chosen, 0.9525.