In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.stats import chi2_contingency

from sklearn.preprocessing  import OrdinalEncoder, OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_roc_curve, classification_report
print("Setup complete")

## Exploratory Data Analysis


**Columns' descriptions**

* **class:** edible=e, poisonous=p
* **cap-shape**: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
* **cap-surface**: fibrous=f,grooves=g,scaly=y,smooth=s
* **cap-color**: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
* **bruises**: bruises=t,no=f
* **odor**: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
* **gill-attachment**: attached=a,descending=d,free=f,notched=n
* **gill-spacing**: close=c,crowded=w,distant=d
* **gill-size**: broad=b,narrow=n
* **gill-color**: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
* **stalk-shape**: enlarging=e,tapering=t
* **stalk-root**: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
* **stalk-surface-above-ring**: fibrous=f,scaly=y,silky=k,smooth=s
* **stalk-surface-below-ring**: fibrous=f,scaly=y,silky=k,smooth=s
* **stalk-color-above-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* **stalk-color-below-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* **veil-type**: partial=p,universal=u
* **veil-color**: brown=n,orange=o,white=w,yellow=y
* **ring-number**: none=n,one=o,two=t
* **ring-type**: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
* **spore-print-color**: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
* **population**: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
* **habitat**: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d 



In [None]:
# Load the mushroom dataset from the Kaggle input directory
df = pd.read_csv("../input/mushroom-classification/mushrooms.csv")

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

# Preview the first rows of the dataset
df.head()

In [None]:
# Check the value counts of the target class.
# seaborn is already imported as `sns` in the setup cell at the top of the
# notebook, so it is not imported again here.
print("Distribution.....")
print(df['class'].value_counts())

# Bar chart of the number of rows per class
sns.countplot(x='class', data=df)

In [None]:
# Summary statistics for each column of the raw dataset
df.describe()

In [None]:
def hide_spines(ax, spines=("top", "right", "left", "bottom")):
    """Hide the selected spines (border lines) of an axes.

    Parameters
    ----------
    ax : object exposing a ``spines`` mapping whose values have
        ``set_visible`` (e.g. a matplotlib ``Axes``).
    spines : iterable of str, optional
        Names of the spines to hide. Defaults to all four sides. The default
        is an immutable tuple so it can never be altered between calls.

    Returns
    -------
    None
    """
    for spine in spines:
        ax.spines[spine].set_visible(False)

In [None]:
# Every column after the first is treated as a feature
# (assumes the first column is the `class` target — verify against df.head()).
features = list(df.columns)[1:]
print(features, len(features))

## Feature Distributions

In [None]:
# Grid of count plots, one panel per feature (8 x 3 layout).
rows, cols = 8, 3
mako_palette = sns.color_palette("mako")

fig = plt.figure(figsize=(15, 25))
fig.set_facecolor("#fff")

for idx, feature in enumerate(features):
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.set_facecolor("#fff")
    ax.grid(axis="y", linewidth=1, color="green", linestyle="--", zorder=0)

    # Draw on the panel that was just created
    sns.countplot(
        x=feature,
        data=df,
        palette=mako_palette,
        ec="#000",
        alpha=1,
        linewidth=1.5,
        zorder=2,
        ax=ax,
    )

    # No tick marks; slightly larger labels on the x axis than on the y axis
    for axis, label_size in ((ax.xaxis, 12), (ax.yaxis, 10)):
        axis.set_tick_params(size=0, labelsize=label_size, pad=7)

    # Only the left-most panel of each row carries the "Count" label
    in_first_column = idx % cols == 0
    if in_first_column:
        ax.set_ylabel("Count", fontsize=14, fontfamily="serif", labelpad=7)
    else:
        ax.set_ylabel("")

    ax.set_xlabel(feature, fontsize=14, fontfamily="serif", labelpad=7)

    # Keep only a thick bottom spine
    hide_spines(ax, spines=["top", "right", "left"])
    ax.spines["bottom"].set_linewidth(2)

    # Lower y-limit fixed at 1; the upper limit is left to matplotlib
    ax.set_ylim(bottom=1)

fig.text(x=0.05, y=1.01, s="Features Distributions", fontsize=22, fontweight="bold", fontfamily="serif")
fig.tight_layout(w_pad=2, h_pad=1.5)
fig.show()

In [None]:

# Grid of count plots split by class, one panel per feature. Each legend also
# shows the p-value of a chi-square test of independence between that feature
# and the class.
rows = 8
cols = 3
rocket_palette = sns.color_palette("rocket")
fig = plt.figure(figsize=(15, 25))
fig.set_facecolor("#fff")
for idx, feature in enumerate(features):
    ax = fig.add_subplot(rows, cols, idx+1)
    ax.set_facecolor("#fff")
    ax.grid(axis="y", linewidth=1, color="green", linestyle="--", zorder=0)
    sns.countplot(x=feature, palette=rocket_palette, hue="class", data=df,
                  ec="#000", alpha=1, linewidth=1.5, zorder=2, ax=ax)

    # Observed contingency table: one row per category of the feature, one
    # column per class, each cell holding the number of mushrooms. This is the
    # table the chi-square test of independence must be computed on;
    # cross-tabulating the two per-class count vectors against each other
    # does not produce observed frequencies.
    contingency = pd.crosstab(df[feature], df["class"])
    p_value = chi2_contingency(contingency)[1]

    # Only the left-most panel of each row carries the "Count" label
    if idx % cols == 0:
        ax.set_ylabel("Count", fontsize=14, fontfamily="serif", labelpad=7)
    else:
        ax.set_ylabel("")

    ax.set_xlabel(feature, fontsize=14, fontfamily="serif", labelpad=7)
    hide_spines(ax, spines=["top", "right", "left"])
    ax.spines["bottom"].set(linewidth=2)
    # Invisible line whose only purpose is to add the p-value to the legend
    ax.axvline(x=0, label=f"chi^2 p_value: {np.round(p_value, 2)}", visible=False, color="#000")
    ax.set_ylim(1)
    ax.legend()

fig.text(x=0.05, y=1.01, s="class - Features Distributions", fontsize=22, fontweight="bold", fontfamily="serif")
fig.tight_layout(w_pad=2, h_pad=1.5)
fig.show()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes)
df.info()

We can see that every column has dtype `object`.

In [None]:
# Check null values
import missingno

# Number of NaN entries per column
print(df.isna().sum())

# The column descriptions list "?" as the code for a missing `stalk-root`.
# That placeholder is an ordinary string, so isna() does not count it;
# report any column that contains it separately.
placeholder_counts = df.eq("?").sum()
print(placeholder_counts[placeholder_counts > 0])

# Visual overview of NaN entries (trailing ';' hides the returned Axes repr)
missingno.matrix(df, figsize=(30,10));

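No NaN values were found in the dataset. Note, however, that the column descriptions above list `?` as the code for a missing `stalk-root`; `isna()` does not detect that placeholder.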

In [None]:
# Columns kept for modelling: twelve feature columns plus the `class` target,
# which a later cell separates from the features.
# NOTE(review): the criterion used to select these features is not shown in
# this notebook — confirm how they were chosen.
most_useful_features = ['cap-shape', 'cap-surface', 'cap-color', 'odor',
                       'gill-color', 'stalk-surface-below-ring', 'veil-color',
                       'ring-number', 'ring-type', 'spore-print-color',
                       'population', 'habitat', 'class']
# Column subset of the raw frame
cleaned_dataset = df[most_useful_features]
cleaned_dataset

## Data Preprocessing

In [None]:
# Map the class values as "e" -> 0 and "p" -> 1.
# The labels are read from `df` (same rows and order as `cleaned_dataset`) and
# the target column is dropped by reassignment rather than popped in place, so
# this cell can be re-run without raising a KeyError.
labels = df["class"].map({"e": 0, "p": 1}).values
cleaned_dataset = cleaned_dataset.drop(columns="class", errors="ignore")
labels[:10]

In [None]:
# Create a function that turns Categorical values into numerical
def ordinal_encode(X):
    """Return ``X`` with every column encoded by a fresh ``OrdinalEncoder``.

    The encoder is fitted on ``X`` itself and then applied to it; only the
    transformed values are returned (the fitted encoder is discarded).
    """
    return OrdinalEncoder().fit_transform(X)

In [None]:
# Encode the selected feature columns with the `ordinal_encode` helper
encoded_dataset = ordinal_encode(cleaned_dataset)
# Inspect the first encoded row
encoded_dataset[0]

In [None]:
# Rebuild a DataFrame from the encoded feature matrix, reusing the feature
# column names, and attach the numeric labels as the `Class` column.
data = (
    pd.DataFrame(encoded_dataset, columns=cleaned_dataset.columns)
    .assign(Class=pd.Series(labels))
)

# Preview the first rows
data.head()

In [None]:
# Preview the last rows of the encoded dataset
data.tail()

In [None]:
# Pairwise correlation matrix of the encoded features and the `Class` column,
# drawn as an annotated heatmap.
corr = data.corr()
fig, ax = plt.subplots(figsize=(15,15))
heatmap_options = dict(annot=True, linewidth=0.5, fmt='.2f', cmap='YlGnBu')
sns.heatmap(corr, ax=ax, **heatmap_options)

## Split the dataset into training and validation

In [None]:
X = data.drop("Class", axis=1) # input features (every column except the target)
y = data['Class']  # target labels

# Split into training and validation: 33% of the rows are held out for
# validation; random_state fixes the shuffle so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X,
                                                 y,
                                                 test_size=0.33,
                                                 random_state=42)

# Check the length of each part of the split
len(X_train), len(X_val), len(y_train), len(y_val)

## Modelling

In [None]:
# RandomForestClassifier: fit on the training split and predict the validation split
rfc_model = RandomForestClassifier(random_state=42,n_jobs=-1)
rfc_model.fit(X_train, y_train)
# NOTE(review): `preds` is overwritten by each model cell and is not read by
# any cell visible in this notebook.
preds = rfc_model.predict(X_val)

# Reported score: mean of a 5-fold cross-validation on the full X, y.
# NOTE(review): this includes the rows held out in X_val — confirm intended.
score = cross_val_score(rfc_model, X, y, cv=5, n_jobs=-1)
rfc_score = np.mean(score)
print(f"RandomForest Score : {rfc_score}")

In [None]:
# DecisionTreeClassifier: fit on the training split and predict the validation split

dtc_model = DecisionTreeClassifier(random_state=42)
dtc_model.fit(X_train, y_train)
# NOTE(review): `preds` is overwritten by each model cell and is not read by
# any cell visible in this notebook.
preds = dtc_model.predict(X_val)

# Reported score: mean of a 5-fold cross-validation on the full X, y.
# NOTE(review): this includes the rows held out in X_val — confirm intended.
score = cross_val_score(dtc_model, X, y, cv=5, n_jobs=-1)
dtc_score = np.mean(score)
print(f"DecisionTree Score :{dtc_score}")

In [None]:
# SGDClassifier: fit on the training split and predict the validation split
sgd_model = SGDClassifier(random_state=42)
sgd_model.fit(X_train, y_train)
# NOTE(review): `preds` is overwritten by each model cell and is not read by
# any cell visible in this notebook.
preds = sgd_model.predict(X_val)

# Reported score: mean of a 5-fold cross-validation on the full X, y.
# NOTE(review): this includes the rows held out in X_val — confirm intended.
score = cross_val_score(sgd_model, X, y, cv=5, n_jobs=-1)
sgd_score = np.mean(score)
print(f"SGD score : {sgd_score}")

In [None]:
# XGBClassifier: fit on the training split and predict the validation split
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
# NOTE(review): `preds` is overwritten by each model cell and is not read by
# any cell visible in this notebook.
preds = xgb_model.predict(X_val)

# Reported score: mean of a 5-fold cross-validation on the full X, y.
# NOTE(review): this includes the rows held out in X_val — confirm intended.
score = cross_val_score(xgb_model,X, y, cv=5, n_jobs=-1)
xgb_score = np.mean(score)
print(f"XGBClassifier score : {xgb_score}")

In [None]:
# CatBoostClassifier: fit on the training split and predict the validation split
cat_boost = CatBoostClassifier(random_state=42, verbose=0)
cat_boost.fit(X_train, y_train)
# NOTE(review): `preds` is overwritten by each model cell and is not read by
# any cell visible in this notebook.
preds = cat_boost.predict(X_val)

# Reported score: mean of a 5-fold cross-validation on the full X, y.
# NOTE(review): this includes the rows held out in X_val — confirm intended.
score = cross_val_score(cat_boost, X, y, cv=5, n_jobs=-1)
cat_boost_score = np.mean(score)

print(f"Catboost Score : {cat_boost_score}")

In [None]:
# Model comparison: a single-row frame (index 'Accuracy') with one column per
# model, holding the mean cross-validation score computed in the cells above.
models = pd.DataFrame({"Random Forest Classifier " :rfc_score,
                       "Decision Tree Classifier  ":dtc_score,
                      "SGD Classifier ":sgd_score,
                      "XGB Classifier ":xgb_score,
                      "CatBoost Classifier ":cat_boost_score},index=['Accuracy'])

In [None]:
# Transpose so that each model is shown on its own row
models.T

Hence, the *Decision Tree Classifier* achieved the best accuracy of the models compared.


## Hyperparameter Tuning

In [None]:
# Search space for the decision-tree hyperparameters
grid = {"max_depth": [None, 2, 4, 6],
        "min_samples_split": np.arange(2, 20, 2),
        "min_samples_leaf": np.arange(1, 20, 2)}

# Seed NumPy's global generator (kept for any code that relies on it)
np.random.seed(42)

# Randomised search: 5 sampled parameter combinations, each scored with
# 5-fold cross-validation. random_state is passed explicitly to both the tree
# and the search, so the sampled candidates and the fitted trees do not depend
# on how much of the global NumPy random stream was consumed before this cell.
# This matches the random_state=42 used for the other models in the notebook.
randomized_tree = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                                     param_distributions=grid,
                                     cv=5,
                                     n_iter=5,
                                     random_state=42,
                                     verbose=True)
# Fit the search on the training split only
randomized_tree.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the randomised search
randomized_tree.best_params_

In [None]:
# Score of the fitted search object on the held-out validation split
randomized_tree.score(X_val,y_val)

In [None]:
# Predictions of the fitted search object on the validation split;
# `y_preds` is reused by every metric cell below.
y_preds = randomized_tree.predict(X_val)
y_preds

## Evaluation Metrics

In [None]:
# ROC curve of the tuned model on the validation split.
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions the replacement is
# `RocCurveDisplay.from_estimator` — confirm the installed version.
plot_roc_curve(randomized_tree, X_val, y_val)

In [None]:
# Accuracy of the tuned model's predictions on the validation split
acc_score = accuracy_score(y_val, y_preds)
acc_score

In [None]:
# Precision on the validation split.
# Labels were mapped as "e" -> 0 and "p" -> 1, so with scikit-learn's default
# positive label (1) the positive class is presumably "poisonous" — confirm.
pre_score = precision_score(y_val, y_preds)
pre_score

In [None]:
# Recall on the validation split.
# Labels were mapped as "e" -> 0 and "p" -> 1, so with scikit-learn's default
# positive label (1) the positive class is presumably "poisonous" — confirm.
rec_score = recall_score(y_val,y_preds)
rec_score

In [None]:
# f1-score
# The result is stored as `f1` so that it does not overwrite the imported
# `f1_score` function; rebinding that name to a float would make this cell
# raise "'float' object is not callable" when run a second time.
f1 = f1_score(y_val, y_preds)
f1

In [None]:
# Confusion Matrix
sns.set(font_scale=1.5)

def plot_conf_mat(y_val,y_preds):
    """
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Parameters
    ----------
    y_val : array-like
        True labels.
    y_preds : array-like
        Predicted labels, in the same order as ``y_val``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    fig, ax = plt.subplots(figsize=(10,5))
    # fmt="d" prints each count as a full integer; heatmap's default
    # annotation format would abbreviate large counts (e.g. 1.4e+03).
    sns.heatmap(confusion_matrix(y_val, y_preds),
                annot=True,
                fmt="d",
                cbar=False,
                ax=ax)

    # Rows of the matrix are true labels, columns are predictions
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

plot_conf_mat(y_val,y_preds)

In [None]:
# classification report
# The report is a multi-line string; print it so the table is laid out
# properly instead of being shown as a one-line repr with "\n" escapes.
print(classification_report(y_val, y_preds))

### **Credits: https://www.kaggle.com/vad13irt/uci-ml-mushrooms-classification**
