# Copy of the AWS notebook

This notebook contains the code that was stored in the AWS SageMaker notebook instance before I tore it down with Terraform.

In [None]:
import pandas as pd
from sagemaker import get_execution_role

# Where the raw dataset lives in S3.
bucket = 'mushroom-classification-bucket-newly-largely-moral-mustang'
data_key = 'mushrooms.csv'
data_location = f's3://{bucket}/{data_key}'

# Execution role as reported by the SageMaker SDK; it is resolved here but not
# passed to read_csv below.
role = get_execution_role()

# Load the CSV directly from the s3:// URI built above.
df = pd.read_csv(data_location)

In [None]:
# Lookup table keyed by column name. Each inner dict maps the single-character code
# stored in that column to a human-readable label; it is applied to `df` with
# Series.replace right after this literal.
# Note: "?" in "stalk-root" is decoded to the explicit label "missing".
mushroom_data = {
    "class": {"e": "edible", "p": "poisonous"},
    "cap-shape": {"b": "bell", "c": "conical", "x": "convex", "f": "flat", "k": "knobbed", "s": "sunken"},
    "cap-surface": {"f": "fibrous", "g": "grooves", "y": "scaly", "s": "smooth"},
    "cap-color": {"n": "brown", "b": "buff", "c": "cinnamon", "g": "gray", "r": "green", "p": "pink", "u": "purple", "e": "red", "w": "white", "y": "yellow"},
    "bruises": {"t": "bruises", "f": "no"},
    "odor": {"a": "almond", "l": "anise", "c": "creosote", "y": "fishy", "f": "foul", "m": "musty", "n": "none", "p": "pungent", "s": "spicy"},
    "gill-attachment": {"a": "attached", "d": "descending", "f": "free", "n": "notched"},
    "gill-spacing": {"c": "close", "w": "crowded", "d": "distant"},
    "gill-size": {"b": "broad", "n": "narrow"},
    "gill-color": {"k": "black", "n": "brown", "b": "buff", "h": "chocolate", "g": "gray", "r": "green", "o": "orange", "p": "pink", "u": "purple", "e": "red", "w": "white", "y": "yellow"},
    "stalk-shape": {"e": "enlarging", "t": "tapering"},
    "stalk-root": {"b": "bulbous", "c": "club", "u": "cup", "e": "equal", "z": "rhizomorphs", "r": "rooted", "?": "missing"},
    "stalk-surface-above-ring": {"f": "fibrous", "y": "scaly", "k": "silky", "s": "smooth"},
    "stalk-surface-below-ring": {"f": "fibrous", "y": "scaly", "k": "silky", "s": "smooth"},
    "stalk-color-above-ring": {"n": "brown", "b": "buff", "c": "cinnamon", "g": "gray", "o": "orange", "p": "pink", "e": "red", "w": "white", "y": "yellow"},
    "stalk-color-below-ring": {"n": "brown", "b": "buff", "c": "cinnamon", "g": "gray", "o": "orange", "p": "pink", "e": "red", "w": "white", "y": "yellow"},
    "veil-type": {"p": "partial", "u": "universal"},
    "veil-color": {"n": "brown", "o": "orange", "w": "white", "y": "yellow"},
    "ring-number": {"n": "none", "o": "one", "t": "two"},
    "ring-type": {"c": "cobwebby", "e": "evanescent", "f": "flaring", "l": "large", "n": "none", "p": "pendant", "s": "sheathing", "z": "zone"},
    "spore-print-color": {"k": "black", "n": "brown", "b": "buff", "h": "chocolate", "r": "green", "o": "orange", "u": "purple", "w": "white", "y": "yellow"},
    "population": {"a": "abundant", "c": "clustered", "n": "numerous", "s": "scattered", "v": "several", "y": "solitary"},
    "habitat": {"g": "grasses", "l": "leaves", "m": "meadows", "p": "paths", "u": "urban", "w": "waste", "d": "woods"}
}
# Decode the single-letter codes column by column. Columns listed in the lookup
# but absent from df are skipped; values with no entry in a column's mapping are
# left as they are by replace().
for column in mushroom_data:
    if column not in df.columns:
        continue
    mapping = mushroom_data[column]
    df[column] = df[column].replace(mapping)
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.style.use("ggplot")

# Class balance of the target: absolute counts first, then each class's share of all rows.
print(df["class"].value_counts())
print()
print(df["class"].value_counts() / len(df))

# Pass the frame as `data` and name the column through `x`. A bare positional Series
# is bound to `data` (wide-form input) by current seaborn rather than to the variable
# being counted, so the chart would not show the class counts.
ax = sns.countplot(data=df, x="class")
ax.set_title("target variable")
plt.show()

In [None]:
plt.style.use("ggplot")

# One bar chart of category frequencies per feature; the target column is skipped.
for column in df:
    if column == "class":
        continue
    fig = plt.figure(figsize=(20, 4))
    # 121 = left cell of a 1x2 grid, so each chart uses half of the wide figure.
    ax = plt.subplot(121)
    df[column].value_counts().plot(kind="bar", ax=ax)
    ax.set_xlabel(column)
    ax.set_ylabel("count")
    ax.set_title(column)
    # Render and release each figure inside the loop. Leaving every figure open
    # until the cell ends exceeds matplotlib's open-figure limit (20 by default),
    # which raises a RuntimeWarning and keeps all figures in memory.
    plt.show()
    plt.close(fig)

In [None]:
# One count plot per feature, split by target class; the target column is skipped.
for column in df:
    if column == "class":
        continue
    fig = plt.figure(figsize=(20, 4))
    # 121 = left cell of a 1x2 grid, so each chart uses half of the wide figure.
    ax = plt.subplot(121)
    sns.countplot(data=df, x=column, hue="class", ax=ax)
    ax.set_title(column)
    # Category labels can be long, so turn the x tick labels vertical.
    plt.xticks(rotation=90)
    # Render and release each figure inside the loop. Leaving every figure open
    # until the cell ends exceeds matplotlib's open-figure limit (20 by default),
    # which raises a RuntimeWarning and keeps all figures in memory.
    plt.show()
    plt.close(fig)

Observations on the data:
- `veil-type` has only one category, so it cannot help separate the classes and should be excluded from the features.

In [None]:
# Summary of the veil-type column; for a string column describe() reports
# count / unique / top / freq, which shows how many distinct categories occur.
df["veil-type"].describe()

In [None]:
# Summary of the column, then how the rows are spread over its categories.
print(df["veil-color"].describe())
print()
# value_counts() yields one count per category. groupby("veil-color").count() would
# instead print the non-null count of every other column for each category, which
# is the same number repeated across a very wide table.
print(df["veil-color"].value_counts())
print()
# Same distribution expressed as a share of all rows.
print(df["veil-color"].value_counts(normalize=True))
print()

# Row-normalised crosstab: each row gives the share of every class within one veil colour.
veil_color_ct = pd.crosstab(df["veil-color"], df["class"], normalize="index")
plt.figure(figsize=(10, 6))
sns.heatmap(veil_color_ct, annot=True, cmap='YlOrRd', fmt='.2f')
plt.title("Class share per veil-color")
plt.show()

The majority of the training cases (97%) fall into the "white" category; however, this category is not indicative of the target class. The other three categories are very indicative. The "orange" and "brown" categories could be merged into a single category to reduce dimensionality, since both correspond to edible targets. We could test with and without this merge. We might also need to add an extra "other" category in case the test data contains unseen categories.

In [None]:
# Share of each class within every habitat (normalised per row), drawn as an
# annotated heatmap on an explicitly created axes.
ct = pd.crosstab(df["habitat"], df["class"], normalize="index")
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(ct, annot=True, cmap='YlOrRd', fmt='.2f', ax=ax)
ax.set(
    title='Heatmap of Independent vs. Dependent Variables',
    xlabel='Dependent Variable',
    ylabel='Independent Variable',
)
plt.show()

In [None]:
%pip install scikit-learn

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every feature column; the target column "class" is left out.
# handle_unknown="ignore": categories not seen during fit are encoded as all zeros
# at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")
encs = ohe.fit_transform(df.loc[:, df.columns != 'class'])

# Reuse df's index so the encoded rows stay aligned with df (and its "class" column)
# even if df is later filtered or re-indexed before this cell runs.
enc_df = pd.DataFrame(
    encs.toarray(),
    index=df.index,
    columns=ohe.get_feature_names_out(),
    dtype=int,
)
# Show a preview only rather than the whole encoded frame.
enc_df.head()

In [None]:
# Indicator columns that never change (a feature with a single category) have zero
# variance, so their correlation with every other column is NaN. Leave them out.
varying = enc_df.loc[:, enc_df.nunique() > 1]
corr = varying.corr()

# The matrix has one row/column per category, so per-cell annotations are illegible
# and slow to draw. Scale the figure with the number of columns and rely on colour.
side = max(15, 0.25 * len(corr.columns))
fig, ax = plt.subplots(figsize=(side, side))
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, center=0, square=True, ax=ax)
ax.set_title("Correlation between one-hot encoded features")
plt.show()