# 1. 🛀 Cleaning/EDA:

### The aim of the project is to distinguish between edible and poisonous mushrooms, and to work out how best to achieve this classification. The first step is to clean our data. We can then explore and visualise it to help us find useful ways to model the data and the best parameters for that all-important binary classification.

## ✅ Cleaning:

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

##
from sklearn import datasets
import sklearn.metrics as sm
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


Attribute Information: (classes: edible=e, poisonous=p)

* cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

* cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

* cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

* bruises: bruises=t,no=f

* odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

* gill-attachment: attached=a,descending=d,free=f,notched=n

* gill-spacing: close=c,crowded=w,distant=d

* gill-size: broad=b,narrow=n

* gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

* stalk-shape: enlarging=e,tapering=t

* stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

* stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

* stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

* stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

* stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

* veil-type: partial=p,universal=u

* veil-color: brown=n,orange=o,white=w,yellow=y

* ring-number: none=n,one=o,two=t

* ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

* spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

* population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

* habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

In [None]:
# Load the raw mushroom data; the bare `df` below makes the notebook display it.
df = pd.read_csv(filepath_or_buffer="mushrooms.csv")
df

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Breakdown of the target column: 4208 edible and 3916 poisonous mushrooms.

df['class'].value_counts()

In [None]:
# No NaN values are reported. Note: isna() does not count the '?' code that
# stalk-root uses for missing values (see the attribute list above), so that
# column can still contain missing entries.

df.isna().sum()

In [None]:
# Print the distinct values found in every column, one column per line.

for name, values in df.items():
    print(name, " : ", values.unique())

In [None]:
# Swap hyphens for underscores in the column names (purely a naming preference).

df.columns = df.columns.str.replace("-", "_", regex=False)
df.columns

In [None]:
# veil_type is the same for every row, so it carries no information - drop it.

df.drop(columns="veil_type", inplace=True)
df.columns

## ✅ Dictionaries to replace the single-letter codes with readable values:

In [None]:
# Dictionaries that translate each column's single-letter codes into readable
# labels - comes in handy later on when we model.
#
# Every code in the attribute information list is mapped, even where this CSV
# may not use it; Series.replace leaves any value without a mapping untouched.
# Spellings are kept consistent across columns (e.g. 'cinnamon' is spelled the
# same for cap and stalk colours).

class_ = {"p": "poisonous", "e": "edible"}

capshape = {"b": "bell", "c": "conical", "x": "convex", "f": "flat",
            "k": "knobbed", "s": "sunken"}

capsurface = {"f": "fibrous", "g": "grooves", "y": "scaly", "s": "smooth"}

capcolor = {"n": "brown", "b": "buff", "c": "cinnamon", "g": "gray", "r": "green",
            "p": "pink", "u": "purple", "e": "red", "w": "white", "y": "yellow"}

bruises = {"t": "bruises", "f": "no"}

odor = {"a": "almond", "l": "anise", "c": "creosote", "y": "fishy", "f": "foul",
        "m": "musty", "n": "none", "p": "pungent", "s": "spicy"}

gillattachment = {"a": "attached", "d": "descending", "f": "free", "n": "notched"}

gillspacing = {"c": "close", "w": "crowded", "d": "distant"}

gillsize = {"b": "broad", "n": "narrow"}

gillcolor = {"k": "black", "n": "brown", "g": "gray", "p": "pink", "w": "white",
             "h": "chocolate", "u": "purple", "e": "red", "b": "buff",
             "r": "green", "y": "yellow", "o": "orange"}

stalkshape = {"e": "enlarging", "t": "tapering"}

# '?' is the dataset's marker for a missing stalk root.
stalkroot = {"e": "equal", "c": "club", "b": "bulbous", "r": "rooted",
             "u": "cup", "z": "rhizomorphs", "?": "missing"}

stalksurfaceabovering = {"s": "smooth", "f": "fibrous", "k": "silky", "y": "scaly"}

stalksurfacebelowring = {"s": "smooth", "f": "fibrous", "k": "silky", "y": "scaly"}

stalkcolorabovering = {"n": "brown", "g": "gray", "p": "pink", "w": "white",
                       "e": "red", "b": "buff", "c": "cinnamon", "y": "yellow",
                       "o": "orange"}

stalkcolorbelowring = {"n": "brown", "g": "gray", "p": "pink", "w": "white",
                       "e": "red", "b": "buff", "c": "cinnamon", "y": "yellow",
                       "o": "orange"}

veilcolor = {"n": "brown", "o": "orange", "w": "white", "y": "yellow"}

ringnumber = {"n": "none", "o": "one", "t": "two"}

ringtype = {"c": "cobwebby", "e": "evanescent", "f": "flaring", "l": "large",
            "n": "none", "p": "pendant", "s": "sheathing", "z": "zone"}

sporeprintcolor = {"k": "black", "n": "brown", "b": "buff", "h": "chocolate",
                   "r": "green", "o": "orange", "u": "purple", "w": "white",
                   "y": "yellow"}

population = {"a": "abundant", "c": "clustered", "n": "numerous", "s": "scattered",
              "v": "several", "y": "solitary"}

habitat = {"g": "grasses", "l": "leaves", "m": "meadows", "p": "paths",
           "u": "urban", "w": "waste", "d": "woods"}

# Column name -> code/label dictionary, so the replacement is written once
# instead of being repeated for each of the 22 columns.
code_labels = {
    "class": class_,
    "cap_shape": capshape,
    "cap_surface": capsurface,
    "cap_color": capcolor,
    "bruises": bruises,
    "odor": odor,
    "gill_attachment": gillattachment,
    "gill_spacing": gillspacing,
    "gill_size": gillsize,
    "gill_color": gillcolor,
    "stalk_shape": stalkshape,
    "stalk_root": stalkroot,
    "stalk_surface_above_ring": stalksurfaceabovering,
    "stalk_surface_below_ring": stalksurfacebelowring,
    "stalk_color_above_ring": stalkcolorabovering,
    "stalk_color_below_ring": stalkcolorbelowring,
    "veil_color": veilcolor,
    "ring_number": ringnumber,
    "ring_type": ringtype,
    "spore_print_color": sporeprintcolor,
    "population": population,
    "habitat": habitat,
}

for column, labels in code_labels.items():
    df[column] = df[column].replace(labels)

## ✅ EDA:

In [None]:
# a function/for loop used here to output displots on the data

def grapher(col):
    """Draw a seaborn displot of column ``col`` of the global ``df``.

    The plot is coloured by the ``class`` column and titled with the column
    name. Nothing is returned; the caller is expected to show the figure.
    """
    sns.displot(data=df, x=col, hue='class', height=4, aspect=0.7)
    plt.title(f'Graph of {col} broken down by class')
    

# One plot per column, with the x tick labels rotated 90 degrees before showing.
for column_name in df:
    grapher(column_name)
    plt.xticks(rotation=90)
    plt.show()



In [None]:
# Write the cleaned data (without the index column) for use in modelling.

df.to_csv(path_or_buf="cleaned_mushrooms.csv", index=False)