In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the mushroom-classification CSV from the Kaggle input directory.
dataset = pd.read_csv("../input/mushroom-classification/mushrooms.csv")

In [None]:
# Print the column names, dtypes and non-null counts.
dataset.info()

No missing values, but none of the columns have an integer data type, so the features will need to be encoded before modelling.

In [None]:
# Count how many rows fall into each value of the target column.
dataset["class"].value_counts()

`e` stands for edible and `p` for poisonous.

In [None]:
# Separate the features from the target; both are new objects independent of `dataset`.
X = dataset.drop(columns="class")
y = dataset["class"].copy()

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Preview the first rows of the target series.
y.head()

Time to split the data into training, validation and test sets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Two-stage stratified split, seeded so every run yields the same partitions.
# Stage 1 holds out 2708 rows; stage 2 divides that hold-out into 1354
# validation rows and 1354 test rows.
# The intermediate hold-out gets its own name so that re-running this cell
# never re-splits an already reduced test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=2708, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=1354, stratify=y_holdout, random_state=42
)

In [None]:
def encode_labels(y):
    """Turn the raw ``class`` labels into a single 0/1 ``poisonous`` column.

    Parameters
    ----------
    y : Series or DataFrame holding a ``class`` column of 'p' / other labels.

    Returns
    -------
    DataFrame with the ``class`` column replaced by an integer ``poisonous``
    column (1 where the label is 'p', 0 otherwise), keeping the input index.
    """
    frame = pd.DataFrame(y)
    frame["poisonous"] = (frame["class"] == 'p').astype(int)
    return frame.drop(columns="class")

In [None]:
# Try the label encoder on a copy of the training labels and preview the result.
y_train_copy = encode_labels(y_train.copy())
y_train_copy.head()

In [None]:
# Columns sent through the ordinal encoder; every other column is one-hot encoded.
binary_attributes = ["bruises", "gill-size", "stalk-shape"]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Ordinal-encode the binary columns of a copy of the training features and
# wrap the resulting array in a DataFrame that carries the column names.
X_train_copy = X_train.copy()
binary_encoder = OrdinalEncoder()
X_train_binary = pd.DataFrame(
    binary_encoder.fit_transform(X_train_copy[binary_attributes]),
    columns=binary_attributes,
)
X_train_binary.head()

In [None]:
# Everything that is not listed in `binary_attributes` goes to the one-hot encoder.
X_train_categorical = X_train.drop(columns=binary_attributes)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# handle_unknown="ignore": a category that only shows up in the validation or
# test rows is encoded as all zeros instead of making transform() raise.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new spelling first and fall back on old installs.
try:
    categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    categorical_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [None]:
# Fit the one-hot encoder on the training categoricals and encode them.
X_train_catenc = categorical_encoder.fit_transform(X_train_categorical)

In [None]:
# Peek at the first ten encoded rows.
X_train_catenc[:10]

In [None]:
# Category values learned for each input column, one array per column.
categorical_encoder.categories_

In [None]:
encoded_headings = []
for index, array in enumerate(categorical_encoder.categories_):
    for item in array:
        item = list(X_train_categorical)[index] + "__" + item
        encoded_headings.append(item)

In [None]:
encoded_headings

In [None]:
# Wrap the encoded array in a DataFrame labelled with the generated headings.
X_train_catenc = pd.DataFrame(X_train_catenc, columns=encoded_headings)

In [None]:
X_train_catenc.head()

In [None]:
# Put the ordinal-encoded and one-hot-encoded columns side by side.
X_train_encoded = pd.concat([X_train_binary, X_train_catenc], axis=1)

In [None]:
X_train_encoded.head()

All the data is now encoded into one DataFrame of floats. However, the DataFrame is very wide, at 114 dimensions. We can reduce this with Principal Component Analysis (PCA), which lowers the dimensionality a little whilst preserving most of the variance.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)

In [None]:
# Fit PCA on the encoded training frame and project it.
X_train_reduced = pca.fit_transform(X_train_encoded)

In [None]:
# Length of one reduced row = number of components kept.
len(X_train_reduced[0])

The data is now down to only 41 dimensions

It is now a good time to build a pipeline so we can start training some models

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Route the binary columns through the ordinal encoder and every other column
# through the one-hot encoder.
encoding = ColumnTransformer([
    ("binary", binary_encoder, binary_attributes),
    ("categories", categorical_encoder, list(X_train_categorical))
])

In [None]:
# Chain the encoding step with the PCA reduction.
pipeline = Pipeline([
    ("encoding", encoding),
    ("pca", pca)
])

In [None]:
# Fit the encoding + PCA pipeline on the training features and transform them.
X_train_1 = pipeline.fit_transform(X_train.copy())

In [None]:
# Encode the training labels as 0/1 and flatten the one-column frame to a 1-D array.
y_train_1 = encode_labels(y_train.copy()).values.reshape(-1,)

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
ridge_clf = RidgeClassifier()

In [None]:
# Train on the PCA-reduced training data.
ridge_clf.fit(X_train_1, y_train_1)

In [None]:
# Apply the already-fitted pipeline to the validation features (transform only).
X_valid_1 = pipeline.transform(X_valid.copy())

In [None]:
# Validation labels as a flat 0/1 array.
y_valid_1 = encode_labels(y_valid.copy()).values.reshape(-1,)

In [None]:
# Validation score of the ridge classifier.
ridge_clf.score(X_valid_1, y_valid_1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
kn_clf = KNeighborsClassifier()

In [None]:
# Train k-nearest-neighbours on the same PCA-reduced training data.
kn_clf.fit(X_train_1, y_train_1)

In [None]:
# Validation score of the k-NN classifier on PCA-reduced features.
kn_clf.score(X_valid_1, y_valid_1)

In [None]:
# Encode without the PCA step so the individual encoded columns are kept.
X_train_2 = encoding.fit_transform(X_train.copy())

In [None]:
X_valid_2 = encoding.transform(X_valid.copy())

In [None]:
kn_clf2 = KNeighborsClassifier()

In [None]:
# Same labels as before; only the feature representation changed.
kn_clf2.fit(X_train_2, y_train_1)

In [None]:
# Validation score of k-NN on the un-reduced encoded features.
kn_clf2.score(X_valid_2, y_valid_1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()

In [None]:
rf_clf.fit(X_train_2, y_train_1)

In [None]:
rf_clf.score(X_valid_2, y_valid_1)

In [None]:
rf_clf.feature_importances_

In [None]:
attribs = list(X_train_encoded)

In [None]:
sorted(zip(rf_clf.feature_importances_, list(X_train_encoded)), reverse=True)

Here we can see that our Random Forest Classifier (which scored 100% on the validation set) identified the following as the 10 most important features for determining whether a mushroom is poisonous: whether the mushroom has no odor, whether it has a foul odor, the gill size, whether the gill color is buff, whether the ring type is pendant, whether the mushroom has bruises, the population abundance (`v` meaning several), whether the spore print color is chocolate, whether the stalk surface above the ring is silky, and whether the stalk surface below the ring is silky. Now I will see if I can train a strong model on just these ten attributes.

In [None]:
t10_attribs = ["odor__n", "odor__f", "gill-size", "gill-color__b", "ring-type__p", "bruises", "population__v", "spore-print-color__h", "stalk-surface-above-ring__k", "stalk-surface-below-ring__k"]

In [None]:
X_train_t10 = pd.DataFrame(encoding.fit_transform(X_train.copy()), columns=attribs)[t10_attribs]
X_valid_t10 = pd.DataFrame(encoding.transform(X_valid.copy()), columns=attribs)[t10_attribs]

In [None]:
y_train_t10 = y_train_1.copy()
y_valid_t10 = y_valid_1.copy()

In [None]:
rf_clf_t10 = RandomForestClassifier()

In [None]:
rf_clf_t10.fit(X_train_t10, y_train_t10)

In [None]:
rf_clf_t10.score(X_valid_t10, y_valid_t10)

In [None]:
kn_clf_t10 = KNeighborsClassifier()

In [None]:
kn_clf_t10.fit(X_train_t10, y_train_t10)

In [None]:
kn_clf_t10.score(X_valid_t10, y_valid_t10)

Looks like we need more than just 10 attributes to train a 100% accurate model, so let's try with the 15 most important attributes.

In [None]:
t15_attribs = t10_attribs + ["ring-type__l", "gill-spacing__c", "stalk-surface-above-ring__s", "stalk-shape", "stalk-root__b"]

In [None]:
X_train_t15 = pd.DataFrame(encoding.fit_transform(X_train.copy()), columns=attribs)[t15_attribs]
X_valid_t15 = pd.DataFrame(encoding.transform(X_valid.copy()), columns=attribs)[t15_attribs]

In [None]:
y_train_t15 = y_train_1.copy()
y_valid_t15 = y_valid_1.copy()

In [None]:
rf_clf_t15 = RandomForestClassifier()

In [None]:
rf_clf_t15.fit(X_train_t15, y_train_t15)

In [None]:
rf_clf_t15.score(X_valid_t15, y_valid_t15)

In [None]:
kn_clf_t15 = KNeighborsClassifier()

In [None]:
kn_clf_t15.fit(X_train_t15, y_train_t15)

In [None]:
kn_clf_t15.score(X_valid_t15, y_valid_t15)

In [None]:
t20_attribs = t15_attribs + ["gill-spacing__w", "odor__p", "spore-print-color__w", "stalk-root__e", "odor__a"]

In [None]:
X_train_t20 = pd.DataFrame(encoding.fit_transform(X_train.copy()), columns=attribs)[t20_attribs]
X_valid_t20 = pd.DataFrame(encoding.transform(X_valid.copy()), columns=attribs)[t20_attribs]

In [None]:
y_train_t20 = y_train_t10.copy()
y_valid_t20 = y_valid_t10.copy()

In [None]:
rf_clf_t20 = RandomForestClassifier()

In [None]:
rf_clf_t20.fit(X_train_t20, y_train_t20)

In [None]:
rf_clf_t20.score(X_valid_t20, y_valid_t20)

In [None]:
kn_clf_t20 = KNeighborsClassifier()

In [None]:
kn_clf_t20.fit(X_train_t20, y_train_t20)

In [None]:
kn_clf_t20.score(X_valid_t20, y_valid_t20)

So we in fact only needed 20 attributes to determine whether or not a mushroom is poisonous. Let's finally try out our model on the test set.

In [None]:
# Encode the held-out test features with the already-fitted encoder (transform
# only) and keep the same twenty columns; encode the test labels as a flat 0/1 array.
X_test_t20 = pd.DataFrame(encoding.transform(X_test.copy()), columns=attribs)[t20_attribs]
y_test_t20 = encode_labels(y_test.copy()).values.reshape(-1,)

In [None]:
# Test-set score of the random forest trained on the top-20 columns.
rf_clf_t20.score(X_test_t20, y_test_t20)

In [None]:
# Test-set score of the k-NN classifier trained on the top-20 columns.
kn_clf_t20.score(X_test_t20, y_test_t20)

So the model fits new data perfectly! Now let's see which attributes are most important for determining whether or not a mushroom is poisonous.

In [None]:
y_train_t20

In [None]:
corr_y = pd.DataFrame(y_train_t20, columns=["poisonous"])

In [None]:
corr = pd.concat([corr_y, X_test_t20], axis=1)

In [None]:
corr.head()

In [None]:
corr_matrix = corr.corr()

In [None]:
corr_matrix["poisonous"]

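So we can see that the three attributes most strongly correlated with a mushroom being poisonous (judged by the magnitude of the values in `corr_matrix["poisonous"]`) are: a small gill size, a white spore print and the absence of a pungent odor.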