# Data Munging


In [None]:
# Data Munging
# Importing libraries 
import numpy as np 
import pandas as pd
import janitor

# !pip3 install pyjanitor # run this once

import sklearn 
from sklearn.impute import KNNImputer
from sklearn import preprocessing
# Load the raw mushroom dataset and take a first look at it.
data = pd.read_csv('../data/mushrooms.csv')
data.info()  # data types and null counts per column
data.columns  # raw column names, before cleaning
# Clean the column names automatically (each '-' is replaced with '_').
data = data.clean_names()
data.columns  # column names after the automatic cleaning
data.describe()
# Print the distinct values held by every column.
# veil_type turns out to hold one single value, so it is not informative and is dropped below.
data.columns.tolist()
for column_name in data.columns.tolist():
    print(column_name, ':  ', data[column_name].unique())
data.drop(columns='veil_type', inplace=True)
# We can see that columns have multiple repetitive values (letters), even though they mean different things in each column. 
# This might cause some problems like duplications when we create dummy variables for these categorical values. 

# We can also see that the column 'stalk_root' has a non-alphanumeric value and it might need some munging. 
# According to the dataset's documentation, the value '?' in stalk_root means that they are missing or unknown stalk root data. 
# Let's see how many of these missing values we have to decide if it'd be okay to drop these rows. 

# Frequency table of the stalk_root values ('?' is the missing/unknown marker).
# The counts are computed once and reused instead of calling value_counts() three times.
root_counts = data['stalk_root'].value_counts()

vals = root_counts.index.tolist()

NA_count = root_counts.values

# Fraction of rows per value; the total is computed once rather than once per element.
NA_frac = (root_counts / root_counts.sum()).tolist()

pd.DataFrame(zip(NA_count, NA_frac), columns=['Count', 'Fraction'], index=vals)

# So, now we can see that if we drop the missing values in this column we're losing 30% of our data which accounts for about 2500 instances. 
# Dropping the rows is not the best solution in this case. 
# Therefore, we'll try to impute using KNN.
# Before that, the categorical value must be numerically encoded/labelled from 0 to n. 


# ['population', 'cap_shape', 'stalk_shape', 'stalk_surface_above_ring', 'stalk_surface_below_ring', 'stalk_color_below_ring']
# Let's see the order of values in this column
# Order of the raw values in this column
data.stalk_root.unique()

# Label-encode every column. One fitted encoder is kept per column so the
# integer codes can still be mapped back to the original values later.
encoders = {}
for i in data.columns.tolist():
    le = preprocessing.LabelEncoder()
    data[i] = le.fit_transform(data[i])
    encoders[i] = le

data
# Order of values after encoding
data['stalk_root'].unique()

# For the imputer to see the unknown entries as missing, the code that '?' was
# encoded to must become NaN. The code is looked up instead of being hard-coded.
missing_code = encoders['stalk_root'].transform(['?'])[0]
# Work on a copy so the fully encoded `data` frame used by later cells is unchanged.
knn_input = data.replace({'stalk_root': {missing_code: np.nan}})
knn_input.stalk_root.value_counts(dropna=False)

# Fit on all columns: with stalk_root alone there are no other features on
# which to measure the distance between rows.
imputer = KNNImputer(missing_values = np.nan, n_neighbors=5, weights = 'distance')
imputed = imputer.fit_transform(knn_input)
# KNNImputer averages the neighbours' values, so round back to the nearest code.
# NOTE(review): the codes are nominal labels, so treat this as a rough estimate.
stalk_root_imputed = pd.Series(
    np.rint(imputed[:, knn_input.columns.get_loc('stalk_root')]).astype(int),
    index=knn_input.index,
    name='stalk_root',
)
stalk_root_imputed.value_counts()
# NOTE(review): the earlier conclusion that KNNImputer gave no useful results (and that
# the column should just be dropped) came from a run in which both the NaN replacement
# and the imputed values were discarded; re-check it against the counts above.


### <font color='green'>Sara-EDA</font> 

In [None]:
# Habitat vs class: share of each class within every habitat

cross_tab_prop = pd.crosstab(index=data['habitat'],
                             columns=data['class'],
                             normalize="index")

cross_tab_prop.plot(kind='bar',
                    stacked=True,
                    colormap='tab10',
                    figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Habitat")
plt.ylabel("Proportion")
# The title used to read 'Veil Color by the Class' (copied from another plot).
plt.title('Habitat by the Class')

# Write each segment's percentage at the top edge of that segment.
for n, x in enumerate(cross_tab_prop.index.values):
    row = cross_tab_prop.loc[x]
    for proportion, y_loc in zip(row, row.cumsum()):
        plt.text(x=n - 0.17,
                 y=y_loc,
                 s=f'{np.round(proportion * 100, 1)}%',
                 color="black",
                 fontsize=12,
                 fontweight="bold")

plt.show()

In [None]:
# Population vs class: share of each class within every population level

cross_tab_prop = pd.crosstab(
    index=data['population'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Population")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

In [None]:
# Odor vs class: share of each class within every odor

cross_tab_prop = pd.crosstab(
    index=data['odor'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Odor")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

In [None]:
# Spore print color vs class: share of each class within every spore print color

cross_tab_prop = pd.crosstab(
    index=data['spore_print_color'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Spore Print Color")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

In [None]:
# Bruised vs class: share of each class among bruised and unbruised mushrooms

cross_tab_prop = pd.crosstab(
    index=data['bruises'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Bruised")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

In [None]:
# Stalk root vs habitat: number of mushrooms for every (stalk_root, habitat) pair

hm = data.groupby(['habitat','stalk_root'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='stalk_root', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Stalk shape vs habitat: number of mushrooms for every (stalk_shape, habitat) pair

hm = data.groupby(['habitat','stalk_shape'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='stalk_shape', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Cap shape vs habitat: number of mushrooms for every (cap_shape, habitat) pair

hm = data.groupby(['habitat','cap_shape'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='cap_shape', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Cap surface vs habitat: number of mushrooms for every (cap_surface, habitat) pair

hm = data.groupby(['habitat','cap_surface'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='cap_surface', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Cap color vs habitat: number of mushrooms for every (cap_color, habitat) pair

hm = data.groupby(['habitat','cap_color'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='cap_color', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Veil color vs habitat: number of mushrooms for every (veil_color, habitat) pair

hm = data.groupby(['habitat','veil_color'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='veil_color', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

In [None]:
# Ring number vs habitat: number of mushrooms for every (ring_number, habitat) pair

hm = data.groupby(['habitat','ring_number'], as_index = False)[['class']].count()
# pivot() takes keyword arguments only in pandas 2.x.
hm = hm.pivot(index='ring_number', columns='habitat', values='class')
# fillna returns a new frame, so the result must be kept for empty pairs to show as 0.
hm = hm.fillna(0)
ax = sns.heatmap(hm, cmap="YlGnBu")

### <font color='green'>Mohammed-EDA</font>

In [None]:
# Load the cleaned dataset and discard the 'Unnamed: 0' column read from the CSV.
df = pd.read_csv('../data/mushroom_clean.csv')
df.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Number of mushrooms per veil color, split by class.
plt.figure(figsize=(15, 5))
sns.countplot(data=df, x='veil_color', hue='class')
plt.xlabel('Veil Color')
plt.ylabel('No. Of Mushroom')
plt.title('Veil Color by the Class');

From the graph, we can see that the veil color can be used to tell the class of the mushroom, except when the color is white.

# Keep only the white-veiled mushrooms. The mask must be built from `df` itself;
# it used to be built from the separate `data` frame.
ndf = df.loc[df['veil_color'] == 'w']
# One count plot per column of the subset, split by class.
fig, ax = plt.subplots(6, 4, figsize=(30, 20))
for variable, subplot in zip(ndf.columns.tolist(), ax.flatten()):
    sns.countplot(x = variable, hue = 'class' ,  data = ndf,  ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

We did more analysis to find out whether another feature can be used to tell the class when the veil color is white. We noticed that the candidate features are the odor, population, stalk color, and ring type.

In [None]:
# Count plots, split by class, for the four candidate features of the white-veiled subset.
selected_rows = ['population','ring_type','stalk_color_below_ring','stalk_color_above_ring']
fig, ax = plt.subplots(2, 2, figsize=(30, 20))
for variable, subplot in zip(selected_rows, ax.flatten()):
    sns.countplot(x = variable, hue = 'class' ,  data = ndf,  ax=subplot)
    # Label the subplot that was just drawn: plt.title/plt.xlabel only act on the
    # current axes, and the texts used to be the copy-pasted veil-color labels.
    feature_label = variable.replace('_', ' ').title()
    subplot.set_title(f'{feature_label} by the Class')
    subplot.set_xlabel(feature_label)
    subplot.set_ylabel('No. Of Mushroom')

Here we can see the four features more clearly. If the population is abundant or numerous, the mushroom is edible (e). If the ring type is large, the mushroom is poisonous (p). For the stalk color below the ring, the mushroom is poisonous (p) if the color is buff, and there is a high chance of it for brown and cinnamon.

### <font color='green'>Dina-EDA</font>

In [None]:
labels = ['Edible', 'Poisonous']
# Summarise the frame once instead of calling describe() three times.
class_summary = data.describe()['class']
# 'freq' is the count of the most frequent class and the remainder is the other class.
# NOTE(review): the label order assumes the most frequent class is the edible one - confirm.
values = [class_summary['freq'], class_summary['count'] - class_summary['freq']]
colors = ['green', 'pink']

In [None]:
# Mushroom counts per gill attachment, split by class.
plt.figure(figsize=(15, 8))
# The column names were cleaned earlier ('-' replaced with '_'), so the column is
# 'gill_attachment'; the hyphenated name no longer exists in `data`.
splot = sns.countplot(data=data, x='gill_attachment',
                      hue='class',
                      order=data['gill_attachment'].value_counts().index,
                      palette=['pink', 'forestgreen'])
# Bars are ordered by frequency, most common first.
splot.set_xticklabels(['free', 'attached'])

# Write the count above every bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')
# NOTE(review): the legend texts are hard-coded; confirm they match the hue order seaborn picks.
plt.legend(['Poisonous', 'Edible'], loc='upper right')
plt.ylabel('Number of the Mushrooms', fontsize=13)
plt.xlabel('Types of the Gill Attachments', fontsize=13)
plt.title('Mushrooms Classes vs Gill Attachments', fontsize=15);

According to this graph, mushrooms with the Free gill attachment are approximately equally split between the two classes. In contrast, mushrooms with the Attached gill attachment are most frequently edible.

In [None]:
# Mushroom counts per gill spacing, split by class.
plt.figure(figsize=(15, 8))
# The column names were cleaned earlier ('-' replaced with '_'), so the column is
# 'gill_spacing'; the hyphenated name no longer exists in `data`.
splot = sns.countplot(data=data, x='gill_spacing',
                      hue='class',
                      order=data['gill_spacing'].value_counts().index,
                      palette=['pink', 'forestgreen'])
# Bars are ordered by frequency, most common first.
splot.set_xticklabels(['Close', 'Crowded'])
# Write the count above every bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')
# NOTE(review): the legend texts are hard-coded; confirm they match the hue order seaborn picks.
plt.legend(['Poisonous', 'Edible'], loc='upper right')
plt.ylabel('Number of the Mushrooms', fontsize=13)
plt.xlabel('Types of the Gill Spacing', fontsize=13)
plt.title('Mushrooms Classes vs Gill Spacing', fontsize=15);

According to this figure, mushrooms with the Crowded gill spacing are frequently edible. However, we cannot say the same for mushrooms with the Close gill spacing: their two classes have nearly equal counts.

In [None]:
# Mushroom counts per gill color, split by class.
plt.figure(figsize=(15, 8))
# The column names were cleaned earlier ('-' replaced with '_'), so the column is
# 'gill_color'; the hyphenated name no longer exists in `data`.
splot = sns.countplot(data=data, x='gill_color',
                      hue='class',
                      order=data['gill_color'].value_counts().index,
                      palette=['pink', 'forestgreen'])

# Bars are ordered by frequency, most common first.
# NOTE(review): the names are hard-coded in that order - confirm against value_counts().
splot.set_xticklabels(['Buff', 'Pink', 'White', 'Brown', 'Gray', 'Chocolate', 'Purple', 'Black', 'Red',
                       'Yellow', 'Orange', 'Green'])
# Write the count above every bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')
# NOTE(review): the legend texts are hard-coded; confirm they match the hue order seaborn picks.
plt.legend(['Poisonous', 'Edible'], loc='upper right')
plt.ylabel('Number of the Mushrooms', fontsize=13)
plt.xlabel('The Gill Colors of the Mushrooms', fontsize=13)
plt.title('Mushrooms vs Gill Colors', fontsize=15);

As we can see from the figure above, mushrooms with Buff-colored gills are highly poisonous. In contrast, mushrooms with White, Brown, Purple, Black, or Red gills are very frequently edible.

### <font color='green'>Amal-EDA</font>

In [None]:
# Number of mushrooms per cap shape.
l = data['cap_shape'].value_counts()
# Code -> name. Assumes the codes come from label-encoding the dataset's single-letter
# values in sorted order (b, c, f, k, s, x) - confirm against the dataset documentation.
dic = {0: 'bell', 1: 'conical', 2: 'flat', 3: 'knobbed', 4: 'sunken', 5: 'convex'}
# Pair every count with the name of its own code. The labels used to be taken in
# dict order while the counts are in frequency order, so they were mismatched.
# Values without an entry in the dict are shown as-is.
plt.bar([dic.get(code, str(code)) for code in l.index], l)


In [None]:
# Number of mushrooms per cap surface.
l1 = data['cap_surface'].value_counts()
# Code -> name. Assumes the codes come from label-encoding the dataset's single-letter
# values in sorted order (f, g, s, y) - confirm against the dataset documentation.
# 'smooth' and 'scaly' used to be swapped.
dic1 = {0: 'fibrous', 1: 'grooves', 2: 'smooth', 3: 'scaly'}
# Pair every count with the name of its own code rather than relying on dict order.
# Values without an entry in the dict are shown as-is.
plt.bar([dic1.get(code, str(code)) for code in l1.index], l1)


In [None]:
# Number of mushrooms per cap color.
# Code -> name. Assumes the codes come from label-encoding the dataset's single-letter
# values in sorted order (b, c, e, g, n, p, r, u, w, y) - confirm against the dataset
# documentation. The mapping used to follow the alphabetical order of the color names.
dic = {0: 'buff', 1: 'cinnamon', 2: 'red', 3: 'gray', 4: 'brown',
       5: 'pink', 6: 'green', 7: 'purple', 8: 'white', 9: 'yellow'}
# The column is overwritten with the color names, as before.
data['cap_color'] = data['cap_color'].replace(dic)
ax = sns.histplot(data['cap_color'], bins=20)
plt.xticks(rotation=45)


In [None]:
# def plot_col(col, hue=None, color=['blue', 'purple'], labels=None):
#     fig, ax = plt.subplots(figsize=(15, 7))
#     sns.countplot(col, hue=hue, palette=color, saturation=0.6, data=data, dodge=True, ax=ax)
#     ax.set(title = f"Mushroom {col.title()} Quantity", xlabel=f"{col.title()}", ylabel="Quantity")
#     if labels!=None:
#         ax.set_xticklabels(labels)
#     if hue!=None:
#         ax.legend(('Poisonous', 'Edible'), loc=0)

In [None]:
# color_dict = {"purple":"n","yellow":"y", "blue":"w", "violet":"g", "red":"e","pink":"p",
#               "orange":"b", "purple":"u", "black":"c", "green":"r"}
# plot_col(col='cap_color', color=color_dict.keys(), labels=color_dict)

### <font color='green'>Omar-EDA</font>

**Is there a relation between stalk properties and edibility ?**

In [None]:
# Stalk_shape vs class: share of each class within every stalk shape

cross_tab_prop = pd.crosstab(
    index=data['stalk_shape'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Stalk_shape")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

### Insight: 
   ##### We see that 54.0% of mushrooms with an __enlarging__ stalk shape are poisonous, and 43.8% of mushrooms with a __tapering__ stalk shape are poisonous.

In [None]:
# Stalk_root vs class: share of each class within every stalk root type

cross_tab_prop = pd.crosstab(
    index=data['stalk_root'],
    columns=data['class'],
    normalize="index",
)

cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

plt.legend(loc="lower left", ncol=2)
plt.xlabel("Stalk_root")
plt.ylabel("Proportion")

# Write each segment's percentage at the top edge of that segment.
for bar_pos, level in enumerate(cross_tab_prop.index.values):
    shares = cross_tab_prop.loc[level]
    for share, top_edge in zip(shares, shares.cumsum()):
        plt.text(
            x=bar_pos - 0.17,
            y=top_edge,
            s=f'{np.round(share * 100, 1)}%',
            color="black",
            fontsize=12,
            fontweight="bold",
        )

plt.show()

### Insight: 
   ##### We see that if the mushroom has a __club__, __cup__, or __rooted__ stalk root, then there is a low chance of it being poisonous. However, if the root is __bulbous__, there is a 49.2% chance that the mushroom is poisonous. Moreover, the dataset contains 2480 mushrooms with a __missing__ stalk root.


In [None]:
# Stalk_surface vs class: one stacked proportion chart for the surface above the ring
# and one for the surface below it

for surface_column, axis_label in (
    ('stalk_surface_above_ring', "stalk surface above ring"),
    ('stalk_surface_below_ring', "stalk surface below ring"),
):
    cross_tab_prop = pd.crosstab(
        index=data[surface_column],
        columns=data['class'],
        normalize="index",
    )

    cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

    plt.legend(loc="lower left", ncol=2)
    plt.xlabel(axis_label)
    plt.ylabel("Proportion")

    # Write each segment's percentage at the top edge of that segment.
    for bar_pos, level in enumerate(cross_tab_prop.index.values):
        shares = cross_tab_prop.loc[level]
        for share, top_edge in zip(shares, shares.cumsum()):
            plt.text(
                x=bar_pos - 0.17,
                y=top_edge,
                s=f'{np.round(share * 100, 1)}%',
                color="black",
                fontsize=12,
                fontweight="bold",
            )

plt.show()


### Insight: 
   ##### For the stalk surface, we see that the percentages are similar above and below the ring. __Fibrous__, __smooth__, and __scaly__ mushrooms have an average probability of being poisonous of 25.1%, 30.4%, and 30.1%, respectively. However, __silky__ mushrooms have an average probability of 93.9% of being poisonous.
   #### The dataset contains:
      1- 2372 mushrooms with silky stalk surface above ring.
      2- 2304 mushrooms with silky stalk surface below ring.
      3- 1800 mushrooms with silky stalk surface above and below ring.


In [None]:
# Count the mushrooms whose stalk surface is 'k' both above and below the ring.
is_k_above = data['stalk_surface_above_ring'] == 'k'
is_k_below = data['stalk_surface_below_ring'] == 'k'
da2 = data[is_k_above & is_k_below]
len(da2)

In [None]:
# Stalk_color vs class: one stacked proportion chart for the color above the ring
# and one for the color below it

for color_column, axis_label in (
    ('stalk_color_above_ring', "stalk color above ring"),
    ('stalk_color_below_ring', "stalk color below ring"),
):
    cross_tab_prop = pd.crosstab(
        index=data[color_column],
        columns=data['class'],
        normalize="index",
    )

    cross_tab_prop.plot(kind='bar', stacked=True, colormap='tab10', figsize=(10, 6))

    plt.legend(loc="lower left", ncol=2)
    plt.xlabel(axis_label)
    plt.ylabel("Proportion")

    # Write each segment's percentage at the top edge of that segment.
    for bar_pos, level in enumerate(cross_tab_prop.index.values):
        shares = cross_tab_prop.loc[level]
        for share, top_edge in zip(shares, shares.cumsum()):
            plt.text(
                x=bar_pos - 0.17,
                y=top_edge,
                s=f'{np.round(share * 100, 1)}%',
                color="black",
                fontsize=12,
                fontweight="bold",
            )

plt.show()


### Insight: 
   ##### For the stalk color, we see that the percentages are similar above and below the ring. Mushrooms with a __buff__, __cinnamon__, or __yellow__ stalk color, either above or below the ring, are poisonous in 100% of cases. On the other hand, mushrooms with a __red__, __gray__, or __orange__ stalk color, either above or below the ring, are poisonous in 0% of cases. Also, 38.3% of mushrooms with a white stalk are poisonous, 69.2% of mushrooms with a pink stalk are poisonous, and mushrooms with a brown stalk are poisonous in 92.0% of cases on average.
   ##### Notes: in this dataset,
      1- There are only 8 yellow, 36 cinnamon for stalk color above the ring.
      2- There are only 24 yellow, 36 cinnamon for stalk color below the ring.
      3- Also, the 36 cinnamon stalk colored mushrooms have both cinnamon color above and below ring.
