In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
columns = ['target', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
           'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape',
           'stalk-root', 'stalk-surface-above-ring', 'stack-surface-below-ring', 
           'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color',
           'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 
                 names=columns)

In [4]:
df.head()

Unnamed: 0,target,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stack-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Selecting target, odor and gill-color to make predictions
df_subset = df[['target', 'odor', 'gill-color']]

In [6]:
df_subset.head()

Unnamed: 0,target,odor,gill-color
0,p,p,k
1,e,a,k
2,e,l,n
3,p,p,n
4,e,n,k


In [7]:
# Replace 'p' with 1 and 'e' with 0
df_subset['target'].replace({'p': 1, 'e': 0}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
df_subset.head()

Unnamed: 0,target,odor,gill-color
0,1,p,k
1,0,a,k
2,0,l,n
3,1,p,n
4,0,n,k


## Exploratory Data Analysis
- Distribution of variables

## 51.8% of mushrooms in the dataset are edible, 48.2% are poisonous

In [9]:
df_subset['target'].value_counts(normalize=True)

0    0.517971
1    0.482029
Name: target, dtype: float64

## 43.4% of mushrooms in the dataset have no odor, 26.5% have foul odor, 7.09% have fishy odor, etc.
### odor: a = almond, l = anise, c = creosote, y = fishy, f = foul, m = musty, n = none, p = pungent, s = spicy

In [9]:
df_subset['odor'].value_counts(normalize=True)

n    0.434269
f    0.265879
y    0.070901
s    0.070901
l    0.049237
a    0.049237
p    0.031512
c    0.023634
m    0.004431
Name: odor, dtype: float64

## 21.3% of mushrooms in the dataaset are buff, 18.4% are pink, 14.8% are white, etc.
### gill-color: k = black, n = brown, b = buff, h = chocolate, g = gray, r = green, o = orange, p = pink, 
### u = purple, e = red, w = white, y = yellow 

In [10]:
df_subset['gill-color'].value_counts(normalize=True)

b    0.212703
p    0.183653
w    0.147957
n    0.129000
g    0.092565
h    0.090103
u    0.060561
k    0.050222
e    0.011817
y    0.010586
o    0.007878
r    0.002954
Name: gill-color, dtype: float64

## Here we can see that odor appears to correlate to whether the mushroom poisonous or edible.
### For example, all of the mushrooms with odor "f" (foul) are poisonous. Conversely, all mushrooms with 
### odor "a" (almond) are edible.

In [12]:
df_subset.groupby(['odor', 'target']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gill-color
odor,target,Unnamed: 2_level_1
a,0,400
c,1,192
f,1,2160
l,0,400
m,1,36
n,0,3408
n,1,120
p,1,256
s,1,576
y,1,576


## Here we can see that gill-color often correlates to whether the mushroom poisonous or edible.
### For example, all of the mushrooms with gill color "b" (buff) are poisonous. Conversely, all mushrooms with 
### color "e" (red) are edible. Some of the other variables aren't as great predictors.

In [13]:

df_subset.groupby(['gill-color', 'target']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,odor
gill-color,target,Unnamed: 2_level_1
b,1,1728
e,0,96
g,0,248
g,1,504
h,0,204
h,1,528
k,0,344
k,1,64
n,0,936
n,1,112
