In [1]:
import pandas as pd

In [19]:
csv_file = "../02_models/mushroom_data_clean.csv"
mushroom_df = pd.read_csv(csv_file)

mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [20]:
# First dropping features that are impractical for a hiker to investigate in the wild

mushroom_df.drop(columns=["spore-print-color", "stalk-root"], inplace=True)

In [21]:
# Dropping features that the average hiker might not know how to asses (confusing attribute options)
# Example: Ring-Type attributes inclue: cobwebby, evanescent, flaring, zone

mushroom_df.drop(columns=["ring-type", "gill-attachment"], inplace=True)

In [22]:
# Binning attributes in ways that make predictors easier for the average person to identify
# Example: combining the color options of Brown, Buff, Chocolate and Cinnamon into one value
# First, creating the binning dictionaries

# Brown, Buff, Chocolate, Cinnamon binned to Brown
# Grey and White binned to White
# Red, Pink, Purple binned to Pink
color_bin = {"n": "b", "c": "b", "h": "b", "g": "w", "u": "p", "e": "p"}

# Close and Crowded binned to Close
spacing_bin = {"w": "c"} 

# Abundant, Clustered, Numerous binned to Clustered
population_bin = {"a": "c", "n": "c"}

In [23]:
# Applying the bins to the data

mushroom_df["cap-color"].replace(color_bin, inplace=True)
mushroom_df["gill-color"].replace(color_bin, inplace=True)
mushroom_df["stalk-color-above"].replace(color_bin, inplace=True)
mushroom_df["stalk-color-below"].replace(color_bin, inplace=True)
mushroom_df["veil-color"].replace(color_bin, inplace=True)
mushroom_df["gill-spacing"].replace(spacing_bin, inplace=True)
mushroom_df["population"].replace(population_bin, inplace=True)


In [24]:
# The Bruising feature is true/false, so setting that to 0/1
# The Ring-Number values are 0, 1, 2, so assigning those as integer values
# Setting the target to 1 for poisonous and 0 for edible

mushroom_df["bruises"].replace({"f": 0, "t": 1}, inplace=True)
mushroom_df["ring-number"].replace({"n": 0, "o": 1, "t": 2}, inplace=True)
mushroom_df["class"].replace({"e": 0, "p": 1}, inplace=True)

mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,gill-size,gill-color,stalk-shape,stalk-surface-above,stalk-surface-below,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,population,habitat,class
0,x,s,b,1,p,c,n,k,e,s,s,w,w,p,w,1,s,u,1
1,x,s,y,1,a,c,b,k,e,s,s,w,w,p,w,1,c,g,0
2,b,s,w,1,l,c,b,b,e,s,s,w,w,p,w,1,c,m,0
3,x,y,w,1,p,c,n,b,e,s,s,w,w,p,w,1,s,u,1
4,x,s,w,0,n,c,b,k,t,s,s,w,w,p,w,1,c,g,0


In [28]:
# Correct column names to get along with SQL better

mushroon_df = mushroom_df.rename(columns= {
    "cap-shape": "cap_shape",
    "cap-surface": "cap_surface",
    "cap-color": "cap_color",
    "gill-spacing": "gill_space",
    "gill-size": "gill_size",
    "gill-color": "gill_color",
    "stalk-shape": "stalk_shape",
    "stalk-surface-below": "stalk_surf_below",
    "stalk-surface-above": "stalk_surf_above",
    "stalk-color-below": "stalk_color_below",
    "veil-type": "veil_type",
    "veil-color": "veil_color",
    "ring-number": "ring_number",
    "class": "poisonous"
}, inplace= True)
mushroom_df.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_space,gill_size,gill_color,stalk_shape,stalk_surf_above,stalk_surf_below,stalk-color-above,stalk_color_below,veil_type,veil_color,ring_number,population,habitat,poisonous
0,x,s,b,1,p,c,n,k,e,s,s,w,w,p,w,1,s,u,1
1,x,s,y,1,a,c,b,k,e,s,s,w,w,p,w,1,c,g,0
2,b,s,w,1,l,c,b,b,e,s,s,w,w,p,w,1,c,m,0
3,x,y,w,1,p,c,n,b,e,s,s,w,w,p,w,1,s,u,1
4,x,s,w,0,n,c,b,k,t,s,s,w,w,p,w,1,c,g,0


In [32]:
# Use get_dummies on only the categorical columns

exclude_columns = ["bruises", "ring_number", "poisonous"]

categoricals = [col for col in mushroom_df.columns if col not in exclude_columns]

print(categoricals)

['cap_shape', 'cap_surface', 'cap_color', 'odor', 'gill_space', 'gill_size', 'gill_color', 'stalk_shape', 'stalk_surf_above', 'stalk_surf_below', 'stalk-color-above', 'stalk_color_below', 'veil_type', 'veil_color', 'population', 'habitat']


In [33]:
dummy_dfs = [pd.get_dummies(mushroom_df[col], prefix=col) for col in categoricals]

mush_with_dummies_df = pd.concat([mushroom_df["poisonous"]] + 
                                 [mushroom_df["bruises"]] + 
                                 [mushroom_df["ring_number"]] + 
                                 dummy_dfs, axis=1)

mush_with_dummies_df.head()

Unnamed: 0,poisonous,bruises,ring_number,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [34]:
# Save data to csv to be used with any model

mush_with_dummies_df.to_csv("mush_data_binned.csv", index=False)