# Mushrooms!

Data source: https://www.kaggle.com/uciml/mushroom-classification

Attempting to 
 - build a classification model to determine if mushrooms are edible
 - determine which machine learning algorithm models the data best
 - determine which feature in the data is most important in determining whether a mushroom is poisonous 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the mushroom CSV from the notebook's input directory into a DataFrame.
data = pd.read_csv("../input/mushroom-classification/mushrooms.csv")

In [None]:
# Print the column dtypes and non-null counts to check for missing data.
data.info()

8124 instances, no missing/null data

In [None]:
# Preview the first rows of the raw table.
data.head()

# Exploratory Analysis

In [None]:
# Build the summary table once, keep it for the following cell, and display it.
overview = data.describe()
overview

All columns are descriptive - no numerical data at all


In [None]:
# Keep only summary entries equal to 1 (everything else becomes NaN), then read
# the "unique" row: a column that is not NaN there has a single distinct value.
overview[overview==1].loc["unique"]
# veil-type feature is useless, we can drop it

In [None]:
# Count of instances per target class.
sns.countplot(x="class",data=data)
# fairly even split between classes

In [None]:
# Class counts broken down by cap shape.
sns.countplot(x="class",data=data,hue="cap-shape")
# very few conical or sunken cap shapes
# very few poisonous mushrooms have a sunken shape
# very few edible mushrooms have a conical shape

In [None]:
# Class counts broken down by cap colour.
sns.countplot(x="class",hue="cap-color",data=data)
# very few purple, cinnamon, green mushroom caps

In [None]:
# Class counts broken down by whether the mushroom bruises.
sns.countplot(x="class",hue="bruises",data=data)
# poisonous mushrooms are less likely to have bruises

In [None]:
# Class counts broken down by odour.
sns.countplot(x="class",hue="odor",data=data)
# edible mushrooms only seem to have an almond, anise or no odour
# very few poisonous mushrooms have almond or anise odour

In [None]:
# Class counts broken down by gill size.
sns.countplot(x="class",hue="gill-size",data=data)

In [None]:
# Left: overall gill-attachment distribution. Right: the same feature split by class.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(ax=axes[0],x="gill-attachment",data=data)
sns.countplot(ax=axes[1],x="class", hue="gill-attachment",data=data)
# almost all instances have no gill attachment, CONSIDER REMOVING FEATURE
# when gills are attached, they are more likely to be edible

In [None]:
# Left: overall gill-spacing distribution. Right: the same feature split by class.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(ax=axes[0],x="gill-spacing",data=data)
# Target the second subplot explicitly, matching the other side-by-side cells;
# without `ax` the plot is drawn on whichever axes is current.
sns.countplot(ax=axes[1],x="class",hue="gill-spacing",data=data)
# the majority of instances have close gill spacing, no "distant" instances at all, consider removing
# if mushroom has crowded gill spacing, more likely to be edible

In [None]:
# Class counts broken down by gill colour.
sns.countplot(x="class",hue="gill-color",data=data)
# the vast majority of mushrooms with buff gills are poisonous

In [None]:
# Class counts broken down by stalk shape.
sns.countplot(x="class",hue="stalk-shape",data=data)
# fairly even split

In [None]:
# Top row: stalk surface above the ring (overall, then by class).
# Bottom row: stalk surface below the ring (overall, then by class).
fig, axes = plt.subplots(2, 2, figsize=(15,10))
sns.countplot(ax=axes[0,0],x="stalk-surface-above-ring",data=data)
sns.countplot(ax=axes[0,1],x="class",hue="stalk-surface-above-ring",data=data)
sns.countplot(ax=axes[1,0],x="stalk-surface-below-ring",data=data)
sns.countplot(ax=axes[1,1],x="class",hue="stalk-surface-below-ring",data=data)
# fairly mixed
# very few yellow above-stalk instances

In [None]:
# Overall distribution of veil colour.
sns.countplot(x="veil-color",data=data)
# vast majority of veils are white, CONSIDER REMOVING

In [None]:
# Top row: ring-number (overall, then by class).
# Bottom row: ring-type (overall, then by class).
fig, axes = plt.subplots(2, 2, figsize=(15,10))
sns.countplot(ax=axes[0,0],x="ring-number",data=data)
sns.countplot(ax=axes[0,1],x="class", hue="ring-number",data=data)
sns.countplot(ax=axes[1,0],x="ring-type",data=data)
sns.countplot(ax=axes[1,1],x="class", hue="ring-type",data=data)
# majority of instances have one ring, possibly worth removing - test
# edible mushrooms more likely to have two rings
# poisonous mushrooms more likely to have no rings
# fairly mixed ring types, quite few flaring and none types

In [None]:
# Left: overall spore-print-color distribution. Right: the same feature split by class.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(ax=axes[0],x="spore-print-color",data=data)
sns.countplot(ax=axes[1],x="class", hue="spore-print-color",data=data)
# fairly mixed

In [None]:
# Left: overall population distribution. Right: the same feature split by class.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(ax=axes[0],x="population",data=data)
sns.countplot(ax=axes[1],x="class", hue="population",data=data)
# fairly mixed populations, numerous and abundant populations seem to indicate edible

In [None]:
# Left: overall habitat distribution. Right: the same feature split by class.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(ax=axes[0],x="habitat",data=data)
sns.countplot(ax=axes[1],x="class", hue="habitat",data=data)
# fairly mixed habitats

Useless features: gill-attachment, veil-type (correction: this was WRONG for gill-attachment - it was useful; look at the skewing in the data)

Less useful features: veil-color

Consider for both: gill-spacing

# Data Preparation and Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# List every column name so the feature subset can be chosen.
data.columns

In [None]:
# Feature columns used for modelling; gill-attachment and veil-type are
# deliberately left out. The target is the "class" column.
feature_columns = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
X = data[feature_columns]
y = data['class']

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# experimenting with different encoding methods
# An OrdinalEncoder with default settings, shared by the train and test transforms.
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [None]:
# transforming X using ordinal encoding
# Learn the category -> integer mapping from the training split only.
ordarray_train = ordinal_encoder.fit_transform(X_train)
orddata_train = pd.DataFrame(ordarray_train, columns=X_train.columns, index=X_train.index)
# Reuse the mapping learned on the training split. Re-fitting on the test split
# could assign different integers to the same category, and the model would
# then be scored on inconsistently encoded inputs.
# NOTE: with the encoder's default settings, a category that only occurs in
# the test split raises a ValueError here instead of being silently re-coded.
ordarray_test = ordinal_encoder.transform(X_test)
orddata_test = pd.DataFrame(ordarray_test, columns=X_test.columns, index=X_test.index)

In [None]:
# transforming X using one hot encoding
# The default prefix is each source column's name, which gives the same dummy
# column names as passing the column index explicitly.
hotdata_train = pd.get_dummies(X_train, drop_first=False)
hotdata_test = pd.get_dummies(X_test, drop_first=False)
# The two frames are encoded independently, so a category missing from one
# split would give them different columns. Align the test frame to the training
# layout: dummies absent from the test split become all-zero columns, and
# dummies never seen in training are dropped.
hotdata_test = hotdata_test.reindex(columns=hotdata_train.columns, fill_value=0)

# Models

**Model 1 - Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit one logistic regression per encoding so the two encodings can be compared.
# NOTE(review): max_iter=500 is presumably raised to let the solver converge - confirm.
log_reg_ord = LogisticRegression(max_iter=500)
log_reg_hot = LogisticRegression(max_iter=500)
log_reg_ord.fit(orddata_train,y_train)
log_reg_hot.fit(hotdata_train,y_train)

In [None]:
# Predict the held-out test split with each fitted model.
predict_ord = log_reg_ord.predict(orddata_test)
predict_hot = log_reg_hot.predict(hotdata_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# from context, we should prioritise recall for the poisonous class
# (labelling a poisonous mushroom as edible is the costly mistake)
print(classification_report(y_test,predict_ord))

In [None]:
# One hot encoder achieves 100% accuracy on this set, possibly overfitting
# (test-split report for the one-hot logistic regression)
print(classification_report(y_test,predict_hot))

In [None]:
# Show the class labels in the order the fitted model stores them.
log_reg_hot.classes_

In [None]:
# if no labels are given to confusion matrix, matrix indices represent existing classes sorted in order (same as classes_ attribute)
# positive class is "poisonous"
# from context, we should aim to minimise False Negatives (labelling a poisonous mushroom as edible)
# Arguments are (y_true, y_pred).
confusion_matrix(y_test,predict_ord)
# predicted 69 mushrooms were edible that were poisonous

In [None]:
# Test-split confusion matrix for the one-hot logistic regression (y_true, y_pred).
confusion_matrix(y_test,predict_hot)

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
# using cross validation on 20 folds to determine whether model is over fitting
# seems neither model overfits to training split
# Accuracy of the ordinal-encoded model on each of the 20 folds of the training split.
ord_scores = cross_val_score(log_reg_ord, orddata_train, y_train, cv=20, scoring="accuracy")
ord_scores

In [None]:
# a logistic regression model with one hot encoding seems to be an extremely good model
# Accuracy of the one-hot model on each of the 20 folds of the training split.
hot_scores = cross_val_score(log_reg_hot, hotdata_train, y_train, cv=20, scoring="accuracy")
hot_scores

In [None]:
# Cross-validated (20-fold) predictions for every training row.
hot_log_reg_predict = cross_val_predict(log_reg_hot, hotdata_train, y_train, cv=20)

In [None]:
# Class label order, for reading the confusion matrix rows and columns.
log_reg_hot.classes_

In [None]:
# True labels go first so rows are actual classes and columns are predictions;
# the reversed order transposes the matrix and swaps false positives with
# false negatives.
print(confusion_matrix(y_train,hot_log_reg_predict))

In [None]:
# True labels go first; the reversed order reports precision as recall and
# vice versa.
print(classification_report(y_train,hot_log_reg_predict))

**Model 2 - Decision Tree and Random Forest**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# one hot encoding led to better results with Logistic regression so try this first
# Shallow tree (depth capped at 3) fitted on the one-hot training data.
hot_tree = DecisionTreeClassifier(max_depth=3)
hot_tree.fit(hotdata_train, y_train)

In [None]:
# 20-fold cross-validation scores for the one-hot tree (estimator's default scorer).
hot_tree_scores = cross_val_score(hot_tree, hotdata_train, y_train, cv=20)
hot_tree_scores

In [None]:
# Same shallow tree, fitted on the ordinal-encoded training data for comparison.
ord_tree = DecisionTreeClassifier(max_depth=3)
ord_tree.fit(orddata_train, y_train)

In [None]:
# ordinal encoding again generally led to lower scores
# 20-fold cross-validation scores for the ordinal-encoded tree.
ord_tree_scores = cross_val_score(ord_tree, orddata_train, y_train, cv=20)
ord_tree_scores

In [None]:
# Cross-validated (20-fold) predictions on the training split for both trees.
ord_tree_predict = cross_val_predict(ord_tree, orddata_train, y_train, cv=20)
hot_tree_predict = cross_val_predict(hot_tree, hotdata_train, y_train, cv=20)

In [None]:
# True labels go first so rows are actual classes and columns are predictions,
# consistent with the classification report printed for the same model.
print(confusion_matrix(y_train,ord_tree_predict))

In [None]:
# Per-class precision/recall for the ordinal tree's cross-validated predictions.
print(classification_report(y_train,ord_tree_predict))

In [None]:
# again one hot encoding produces a better classifier
# True labels go first so rows are actual classes and columns are predictions.
print(confusion_matrix(y_train,hot_tree_predict))

In [None]:
# Per-class precision/recall for the one-hot tree's cross-validated predictions.
print(classification_report(y_train,hot_tree_predict))

In [None]:
# use a grid search to find an optimised tree depth
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate tree depths 2 through 12, searched with 5-fold cross-validation;
# training scores are kept alongside the validation scores.
tree_grid = [{'max_depth': list(range(2, 13))}]

grid_search = GridSearchCV(
    estimator=hot_tree,
    param_grid=tree_grid,
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the depth search on the one-hot training data.
grid_search.fit(hotdata_train,y_train)

In [None]:
# Show the tree configuration selected by the search.
grid_search.best_estimator_

In [None]:
# decision tree with depth of 8 and one hot encoding has similar accuracy to log_reg_hot
# NOTE(review): the depth is hard-coded; presumably it is the value the grid
# search selected - confirm against grid_search.best_estimator_.
opt_scores = cross_val_score(DecisionTreeClassifier(max_depth=8), hotdata_train, y_train, cv=20, scoring="accuracy")
opt_scores

In [None]:
# Cross-validated (20-fold) predictions from a fresh depth-8 tree on the training split.
opt_tree_predict = cross_val_predict(DecisionTreeClassifier(max_depth=8), hotdata_train, y_train, cv=20)

In [None]:
# True labels go first so rows are actual classes and columns are predictions;
# the reversed order swaps false positives with false negatives.
print(confusion_matrix(y_train,opt_tree_predict))

In [None]:
# True labels go first; the reversed order reports precision as recall and
# vice versa.
print(classification_report(y_train,opt_tree_predict))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Initial random forest: 500 trees, each limited to 16 leaf nodes, on the one-hot data.
# No random_state is set, so results can vary between runs.
rnd_hot = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16)
rnd_hot.fit(hotdata_train,y_train)

In [None]:
# Predict the held-out test split with the initial forest.
rnd_hot_predict = rnd_hot.predict(hotdata_test)

In [None]:
# True labels go first so rows are actual classes and columns are predictions,
# consistent with the classification report printed for the same model.
print(confusion_matrix(y_test,rnd_hot_predict))

In [None]:
# Per-class precision/recall for the initial forest on the test split.
print(classification_report(y_test,rnd_hot_predict))

In [None]:
# gridsearch to optimise hyperparameters
# Every combination of tree count and leaf-node cap is scored with 5-fold
# cross-validation; training scores are kept alongside the validation scores.
forest_grid = [
    {
        'n_estimators': [30, 40, 50, 70, 80, 90, 100],
        'max_leaf_nodes': [15, 20, 30, 40],
    },
]
forest_search = GridSearchCV(
    estimator=rnd_hot,
    param_grid=forest_grid,
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the forest hyperparameter search on the one-hot training data.
forest_search.fit(hotdata_train,y_train)

In [None]:
# Show the hyperparameter combination selected by the search.
forest_search.best_params_

In [None]:
# Forest with hard-coded hyperparameters, fitted on the one-hot training data.
# NOTE(review): presumably these are the values forest_search.best_params_
# reported; the forest is unseeded, so a re-run may select different ones - confirm.
opt_rnd_hot = RandomForestClassifier(n_estimators=50,max_leaf_nodes=20)
opt_rnd_hot.fit(hotdata_train,y_train)

In [None]:
# Test-split predictions from the tuned forest.
# NOTE(review): this name is rebound by the cross_val_predict cell that follows,
# so these test predictions are not what the later reports use.
opt_rnd_hot_predict = opt_rnd_hot.predict(hotdata_test)

In [None]:
# Cross-validated (20-fold) predictions on the training split; this rebinds
# opt_rnd_hot_predict, replacing any earlier value.
opt_rnd_hot_predict = cross_val_predict(opt_rnd_hot, hotdata_train, y_train, cv=20)

In [None]:
# Confusion matrix for the tuned forest's cross-validated predictions (y_true, y_pred).
print(confusion_matrix(y_train,opt_rnd_hot_predict))

In [None]:
# Per-class precision/recall for the tuned forest's cross-validated predictions.
print(classification_report(y_train,opt_rnd_hot_predict))

# Summary

After transforming the data via One-Hot Encoding, there are various algorithms that produce a near perfect performance. However, from the context of the data, we should prioritise recall over precision which means aiming to minimise False Negatives. As a result, the Random Forest model seems to be the best.

In [None]:
# Final evaluation: predict the held-out test split with the tuned forest.
final_pred = opt_rnd_hot.predict(hotdata_test)

In [None]:
# True labels go first so rows are actual classes and columns are predictions;
# the reversed order swaps false positives with false negatives, the very
# quantities the summary's recall argument relies on.
print(confusion_matrix(y_test,final_pred))

In [None]:
# True labels go first; the reversed order reports precision as recall and
# vice versa.
print(classification_report(y_test,final_pred))

In [None]:
# Collapse the forest's per-dummy-column importances back onto the original
# features. The dummy columns are laid out feature by feature, so each feature
# owns a contiguous slice whose width is its number of categories.
# The widths come from X_train, the frame the dummies were built from; counting
# categories on the full dataset would misalign every later slice if a category
# were missing from the training split.
# NOTE: averaging within a slice dilutes features with many categories; summing
# the slice is the alternative if total contribution per feature is wanted.
means = []
big_index = 0
for count in X_train.nunique():
    count = int(count)
    means.append(np.mean(opt_rnd_hot.feature_importances_[big_index:big_index + count]))
    big_index += count

In [None]:
# Pair each original feature name with its aggregated importance and show the
# table ranked from most to least important.
feature_importances = pd.DataFrame({'col': X_train.columns, 'importance': means})
feature_importances.sort_values(by='importance', ascending=False)

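Based on the mean importance of each feature's one-hot columns, gill-size seems to be the most important feature in determining if a mushroom is poisonous, followed by bruises and odor. Note that averaging over a feature's one-hot columns favours features with few categories.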