In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency 
from sklearn.preprocessing  import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score
import warnings


# Silence every warning category for the rest of the session. This keeps cell
# output compact, but it also hides library deprecation notices.
warnings.simplefilter("ignore")
# Use the fully-qualified option name: option lookup is pattern based, and the
# bare "max_columns" pattern matches more than one key on pandas versions that
# also define "styler.render.max_columns", which raises OptionError.
pd.set_option("display.max_columns", None)

In [None]:
class CONFIG:
    """Notebook-wide settings: data location, random seed and number of CV folds."""

    # Number of splits requested from the K-fold splitter.
    folds = 5
    # Seed handed to the K-fold splitter as its random_state.
    seed = 42
    # Path of the CSV file that is loaded into the working DataFrame.
    path = "../input/mushroom-classification/mushrooms.csv"

In [None]:
# Read the CSV at CONFIG.path into a DataFrame; the analysis cells below use `dataset`.
dataset = pd.read_csv(CONFIG.path)

# Exploratory Data Analysis

## Columns' descriptions

<ul>
<li><b>class</b>: edible=e, poisonous=p

<li><b>cap-shape</b>: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s

<li><b>cap-surface</b>: fibrous=f,grooves=g,scaly=y,smooth=s

<li><b>cap-color</b>: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

<li><b>bruises</b>: bruises=t,no=f

<li><b>odor</b>: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

<li><b>gill-attachment</b>: attached=a,descending=d,free=f,notched=n

<li><b>gill-spacing</b>: close=c,crowded=w,distant=d

<li><b>gill-size</b>: broad=b,narrow=n

<li><b>gill-color</b>: black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

<li><b>stalk-shape</b>: enlarging=e,tapering=t

<li><b>stalk-root</b>: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

<li><b>stalk-surface-above-ring</b>: fibrous=f,scaly=y,silky=k,smooth=s

<li><b>stalk-surface-below-ring</b>: fibrous=f,scaly=y,silky=k,smooth=s

<li><b>stalk-color-above-ring</b>: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

<li><b>stalk-color-below-ring</b>: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

<li><b>veil-type</b>: partial=p,universal=u

<li><b>veil-color</b>: brown=n,orange=o,white=w,yellow=y

<li><b>ring-number</b>: none=n,one=o,two=t

<li><b>ring-type</b>: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

<li><b>spore-print-color</b>: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

<li><b>population</b>: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

<li><b>habitat</b>: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
</ul>

In [None]:
def hide_spines(ax, spines=("top", "right", "left", "bottom")):
    """Hide the named spines (border lines) of an axes.

    Args:
        ax: Object exposing a ``spines`` mapping whose values provide
            ``set_visible`` (e.g. a Matplotlib ``Axes``).
        spines: Iterable of spine names to hide, or a single name as a string.
            Defaults to all four sides. The default is an immutable tuple so it
            can never be altered between calls.

    Raises:
        KeyError: If a requested name is not present in ``ax.spines``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(spines, str):
        spines = (spines,)
    for spine in spines:
        ax.spines[spine].set_visible(False)

In [None]:
# Two-colour palette passed to the class-related count plots, plus seaborn's
# "magma" palette.
colors = ["#ED3E2F", "#85BB65"]
magma = sns.color_palette("magma")
# Render both palettes as swatches for a quick visual check.
sns.palplot(colors)
sns.palplot(magma)

## First view

In [None]:
# Show the loaded frame as the cell output.
dataset

## *class* Analysis

In [None]:
# Single-panel figure with a white background showing how many rows fall in
# each value of the "class" column.
fig = plt.figure(figsize=(7, 7))
fig.set_facecolor("#fff")

ax = fig.add_subplot()
ax.set_facecolor("#fff")

# Bars are drawn above the grid (zorder 2 vs 0) so grid lines do not cross them.
sns.countplot(x="class", data=dataset, palette=colors, ec="#000", alpha=1, linewidth=1.25, zorder=2, ax=ax)
ax.grid(axis="y", linewidth=1.5, color="lightgrey", linestyle="--", zorder=0)

# Axis cosmetics: no tick marks, serif labels, only a thick bottom spine.
ax.xaxis.set_tick_params(size=0, labelsize=14, pad=7)
ax.set_xlabel("Class", fontsize=17, fontfamily="serif", labelpad=7)
ax.yaxis.set_tick_params(size=0, labelsize=12, pad=7)
ax.set_ylabel("Count", fontsize=14, fontfamily="serif", labelpad=7)
hide_spines(ax, spines=["top", "right", "left"])
ax.spines["bottom"].set(linewidth=2.5)

ax.set_title("class Distribution", fontsize=20, fontweight="bold", fontfamily="serif", loc="left")
# Lower y-limit is set to 1; the upper limit is left to autoscaling.
ax.set_ylim(1)
# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and only
# emits a warning on the inline notebook backend.
plt.show()

## Features Analysis

In [None]:
# Every column after the first one is treated as a feature
# (assumes the first column is the target -- confirm against the CSV header).
features = list(dataset.columns[1:])
print(features, len(features))

In [None]:
# Grid of count plots, one panel per feature, three panels per row.
cols = 3
# Ceiling division so the grid always has room for every feature
# (evaluates to 8 rows for the 22 features of this dataset).
rows = -(-len(features) // cols)
fig = plt.figure(figsize=(15, 25))
fig.set_facecolor("#fff")
for idx, feature in enumerate(features):
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.set_facecolor("#fff")
    ax.grid(axis="y", linewidth=1, color="lightgrey", linestyle="--", zorder=0)
    # Use `magma`, the palette created in the palette cell above. The previous
    # name `magma_palette` is only assigned in a later cell, so this cell raised
    # NameError on a fresh kernel. `ax=ax` binds the plot to this panel
    # explicitly instead of relying on pyplot's "current axes".
    sns.countplot(x=feature, palette=magma, data=dataset, ec="#000", alpha=1, linewidth=1.5, zorder=2, ax=ax)
    ax.xaxis.set_tick_params(size=0, labelsize=12, pad=7)
    ax.yaxis.set_tick_params(size=0, labelsize=10, pad=7)

    # Only the left-most panel of each row carries the shared "Count" label.
    if idx % cols == 0:
        ax.set_ylabel("Count", fontsize=14, fontfamily="serif", labelpad=7)
    else:
        ax.set_ylabel("")

    ax.set_xlabel(feature, fontsize=14, fontfamily="serif", labelpad=7)
    hide_spines(ax, spines=["top", "right", "left"])
    ax.spines["bottom"].set(linewidth=2)
    # Lower y-limit is set to 1; the upper limit is left to autoscaling.
    ax.set_ylim(1)

# Figure-level title placed just above the top-left corner of the grid.
fig.text(x=0.05, y=1.01, s="Features Distributions", fontsize=22, fontweight="bold", fontfamily="serif")
fig.tight_layout(w_pad=2, h_pad=1.5)
# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and only
# emits a warning on the inline notebook backend.
plt.show()

## *class* - Features Relationships

In [None]:
# Grid of count plots split by class, one panel per feature, each annotated
# with the p-value of a chi-square test of independence between that feature
# and the class.
cols = 3
# Ceiling division so the grid always has room for every feature
# (evaluates to 8 rows for the 22 features of this dataset).
rows = -(-len(features) // cols)
# Not used in this cell; kept so any cell still referring to `magma_palette`
# keeps working.
magma_palette = sns.color_palette("magma")
fig = plt.figure(figsize=(15, 25))
fig.set_facecolor("#fff")
for idx, feature in enumerate(features):
    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.set_facecolor("#fff")
    ax.grid(axis="y", linewidth=1, color="lightgrey", linestyle="--", zorder=0)
    # `ax=ax` binds the plot to this panel explicitly.
    sns.countplot(x=feature, palette=colors, hue="class", data=dataset, ec="#000", alpha=1, linewidth=1.5, zorder=2, ax=ax)

    # Contingency table of observed counts: one row per feature value, one
    # column per class, with combinations that never occur filled with 0.
    # The previous code cross-tabulated the two per-class count vectors against
    # each other, which treats the counts themselves as categories and does not
    # test feature/class independence.
    contingency = pd.crosstab(dataset[feature], dataset["class"])
    _, p_value, _, _ = chi2_contingency(contingency)

    # Only the left-most panel of each row carries the shared "Count" label.
    if idx % cols == 0:
        ax.set_ylabel("Count", fontsize=14, fontfamily="serif", labelpad=7)
    else:
        ax.set_ylabel("")

    ax.set_xlabel(feature, fontsize=14, fontfamily="serif", labelpad=7)
    hide_spines(ax, spines=["top", "right", "left"])
    ax.spines["bottom"].set(linewidth=2)
    # Invisible line whose only purpose is to add the p-value to the legend.
    # Scientific notation is used because p-values from the corrected test can
    # be far below the two decimals that rounding would keep.
    ax.axvline(x=0, label=f"chi^2 p_value: {p_value:.2e}", visible=False, color="#000")
    # Lower y-limit is set to 1; the upper limit is left to autoscaling.
    ax.set_ylim(1)
    ax.legend()

# Figure-level title placed just above the top-left corner of the grid.
fig.text(x=0.05, y=1.01, s="class - Features Distributions", fontsize=22, fontweight="bold", fontfamily="serif")
fig.tight_layout(w_pad=2, h_pad=1.5)
# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and only
# emits a warning on the inline notebook backend.
plt.show()

In [None]:
# Columns carried forward into preprocessing; "class" (the target) is listed
# last and is still part of the selected frame at this point.
most_useful_features = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "odor",
    "gill-color",
    "stalk-surface-below-ring",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
    "class",
]
cleaned_dataset = dataset[most_useful_features]
cleaned_dataset

# Data Preprocessing

In [None]:
# Binary target as a NumPy array: edible ("e") -> 0, poisonous ("p") -> 1.
# Read from `dataset` rather than popping it out of `cleaned_dataset`: the
# in-place pop removed the column on the first run, so re-running this cell
# raised KeyError.
labels = dataset["class"].map({"e": 0, "p": 1}).values
# Keep every column except the target. Selecting by mask is a no-op when
# "class" is already gone, so the cell can be re-run safely.
cleaned_dataset = cleaned_dataset.loc[:, cleaned_dataset.columns != "class"]

In [None]:
# Peek at the first five encoded labels.
labels[:5]

In [None]:
def ordinal_encode(X):
    """Fit a fresh ``OrdinalEncoder`` on ``X`` and return ``X`` transformed by it."""
    return OrdinalEncoder().fit_transform(X)

In [None]:
# Encode the selected columns with ordinal_encode; the trailing expression
# displays the shape of the result.
encoded_dataset = ordinal_encode(cleaned_dataset)
encoded_dataset.shape

# Model Building

In [None]:
# Shuffled K-fold cross-validation of a decision tree. Both the splitter and
# the classifier are seeded from CONFIG.seed so the reported scores are
# reproducible across runs.
strategy = KFold(n_splits=CONFIG.folds, random_state=CONFIG.seed, shuffle=True)
folds = strategy.split(encoded_dataset, labels)
accuracies, precisions = [], []
for fold_idx, (train_indexes, test_indexes) in enumerate(folds):
    print(f"Fold: [{fold_idx+1}/{CONFIG.folds}]", end=": ")
    train_data, train_labels = encoded_dataset[train_indexes], labels[train_indexes]
    test_data, test_labels = encoded_dataset[test_indexes], labels[test_indexes]

    # A new model per fold, so nothing learned on one split leaks into the next.
    # random_state was previously omitted, leaving the fitted tree (and hence
    # the scores) free to vary between runs.
    fold_model = DecisionTreeClassifier(random_state=CONFIG.seed).fit(train_data, train_labels)

    # Score the fold on its held-out rows only.
    predictions = fold_model.predict(test_data)
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    print(f"Accuracy: {accuracy}", end=" ")
    print(f"Precision: {precision}", end="\n"*2)
    accuracies.append(accuracy)
    precisions.append(precision)

# Average the per-fold scores.
accuracies, precisions = np.array(accuracies), np.array(precisions)
print(f"Mean Accuracy: {accuracies.mean()}\nMean Precision: {precisions.mean()}")