In [None]:
import shap
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Read the comma-separated mushroom dataset from the relative input directory.
data = pd.read_csv(
    '../input/mushroom-classification/mushrooms.csv',
    sep=',',
)
# Display the first five rows as the cell output.
data.head()

In [None]:
# Print the number of columns that contain at least one missing value.
# The count is printed twice, once via isnull() and once via isna();
# pandas documents isnull as an alias of isna, so both lines should agree.
for missing_mask in (data.isnull(), data.isna()):
    print(missing_mask.any().sum())

In [None]:
# Bar chart of how many rows fall in each class.
# Labels and heights are both taken from the same value_counts() result so
# each bar is guaranteed to carry its own label. Pairing unique() (order of
# first appearance) with value_counts() (sorted by descending frequency)
# can silently swap the labels.
class_counts = data['class'].value_counts()
dist = sns.barplot(x=class_counts.index, y=class_counts.values)
dist.set(xlabel='Class', ylabel='Count', title='Distribution of Class categories')

# Indicative values per feature

In [None]:
# One-hot encode the categorical columns of `data`; each category value
# becomes its own indicator column named "<column>_<value>".
data_dummies = pd.get_dummies(data=data)
# Preview the first five encoded rows.
data_dummies.head(n=5)

In [None]:
# Keep the first encoded column as a one-column DataFrame (a slice rather
# than a scalar index, so the result stays two-dimensional and can be
# concatenated with other feature columns).
# NOTE(review): presumably this column is `class_e` - confirm the column order.
edibility = data_dummies.iloc[:, :1]

## Cap shape 

In [None]:
# Place the edibility column next to encoded columns 2..7.
# NOTE(review): these positions are expected to be the cap-shape indicator
# columns - confirm against data_dummies.columns.
feature = pd.concat(
    [edibility, data_dummies.iloc[:, 2:8]],
    axis=1,
    sort=False,
)
feature.head()

In [None]:
# Single-row heatmap: correlation of the first column of `feature`
# (the edibility column) with each cap-shape indicator.
cap_shape_columns = [
    'cap-shape_b',
    'cap-shape_c',
    'cap-shape_f',
    'cap-shape_k',
    'cap-shape_s',
    'cap-shape_x',
]
corr = feature.corr()
sub_corr = corr[cap_shape_columns]

# Diverging palette centred on zero so positive and negative correlations
# are visually distinct; the colour scale is clipped to [-0.5, 0.5].
palette = sns.diverging_palette(20, 220, n=200)

plt.subplots(figsize=(25, 10))
ax = sns.heatmap(
    sub_corr.iloc[:1],  # first row only: edibility vs. each indicator
    vmin=-0.5,
    vmax=0.5,
    center=0,
    cmap=palette,
    square=True,
)

ax.set_title('Correlation between Mushroom Cap Shape and Edibility')
ax.set_ylabel('Edibility')
ax.set_xlabel('Cap Shape')

In [None]:
# Cap surface: edibility column alongside encoded columns 8..11.
# NOTE(review): these positions are expected to be the cap-surface
# indicator columns - confirm against data_dummies.columns.
feature = pd.concat(
    [edibility, data_dummies.iloc[:, 8:12]],
    axis=1,
    sort=False,
)

cap_surface_columns = [
    'cap-surface_f',
    'cap-surface_g',
    'cap-surface_s',
    'cap-surface_y',
]
corr = feature.corr()
sub_corr = corr[cap_surface_columns]

# Only the first correlation row (edibility vs. each indicator) is drawn,
# on a zero-centred diverging colour scale clipped to [-0.5, 0.5].
plt.subplots(figsize=(25, 10))
heatmap_options = dict(
    vmin=-0.5,
    vmax=0.5,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
ax = sns.heatmap(sub_corr.iloc[:1], **heatmap_options)

ax.set_title('Correlation between Mushroom Cap Surface and Edibility')
ax.set_ylabel('Edibility')
ax.set_xlabel('Cap Surface')

In [None]:
# Cap color: edibility column alongside encoded columns 12..21.
# NOTE(review): these positions are expected to be the cap-color indicator
# columns - confirm against data_dummies.columns.
feature = data_dummies.iloc[:, 12:22]
feature = pd.concat([edibility, feature], axis=1, sort=False)

plt.subplots(figsize=(25,10))
corr = feature.corr()
sub_corr = corr[['cap-color_b','cap-color_c', 'cap-color_e','cap-color_g', 'cap-color_n', 'cap-color_p', 'cap-color_r', 'cap-color_u', 'cap-color_w', 'cap-color_y']]

# Draw only the first correlation row (edibility vs. each cap-color
# indicator) on a zero-centred diverging scale clipped to [-0.5, 0.5].
ax = sns.heatmap(sub_corr.iloc[0:1],
                 vmin=-0.5, vmax=0.5, center=0,
                 cmap=sns.diverging_palette(20, 220, n=200),square=True)

ax.set_title('Correlation between Mushroom Cap Color and Edibility')
ax.set_ylabel('Edibility')
# The x axis shows cap-color indicators; the label previously read
# 'Cap Surface', copied over from the cap-surface plot.
ax.set_xlabel('Cap Color')

# RandomForestClassifier usage

In [None]:
# Hold out 20% of the encoded rows for testing; the split is seeded so it
# is reproducible.
train_x, test_x = train_test_split(data_dummies, test_size=0.2, random_state=0)

# The prediction target is the `class_p` indicator. `class_e` is also
# derived from the class label, so both indicators are removed from the
# feature matrices to avoid leaking the label into the features.
target_column = 'class_p'
class_columns = ['class_e', 'class_p']

train_y = train_x[target_column]
train_x = train_x.drop(columns=class_columns)
test_y = test_x[target_column]
test_x = test_x.drop(columns=class_columns)

print(train_y.shape, train_x.shape, test_y.shape, test_x.shape)

In [None]:
# Fit a random forest on the training split. The forest is seeded (same
# seed as the train/test split) so that predictions and the reported
# accuracy are reproducible across "Restart & Run All" executions.
model = RandomForestClassifier(random_state=0)
model.fit(train_x, train_y)

# Predicted `class_p` labels for the held-out rows, shown as the cell output.
preds = model.predict(test_x)
preds

In [None]:
# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=test_y, y_pred=preds)
accuracy