# SETUP
- - -

In [1]:
import pandas as pd

### Data Description

- class : edible=e, poisonous=p
- cap-shape : bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
- cap-surface : fibrous=f, grooves=g, scaly=y, smooth=s
- cap-color : brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
- bruises : bruises=t, no=f
- odor : almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
- gill-attachment : attached=a, descending=d, free=f, notched=n
- gill-spacing : close=c, crowded=w, distant=d
- gill-size : broad=b, narrow=n
- gill-color : black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
- stalk-shape
- stalk-root
- stalk-surface-above-ring
- stalk-surface-below-ring
- stalk-color-above-ring
- stalk-color-below-ring
- veil-type
- veil-color
- ring-number
- ring-type
- spore-print-color
- population
- habitat

In [2]:
# Load the raw mushroom table from a CSV in the current working directory
# and preview its last rows.
data_raw = pd.read_csv('mushrooms.csv')
data_raw.tail()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l
8123,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,o,c,l


In [3]:
# (rows, columns) of the raw frame.
data_raw.shape

(8124, 23)

In [4]:
# Count NaN values per column.
# NOTE(review): isna() only detects real NaN/None. A placeholder string such as
# '?' (which the UCI mushroom data reportedly uses for a missing stalk-root)
# would not be counted here — confirm with data_raw['stalk-root'].value_counts().

data_raw.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:
# Per-column dtype and non-null count summary.

data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

In [13]:
# Keep the column labels for later use; a later cell iterates columns[1:],
# i.e. every column except the first one ("class").

columns = data_raw.columns
columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

# Basic
- - -

### Preprocessing

In [24]:
# Split the raw frame into features and target.
# The target is removed by name rather than by position (iloc[:, 1:]), so the
# label can never leak into the features if the CSV column order changes.
X_raw = data_raw.drop(columns='class')
y_raw = data_raw['class']
y_raw.tail()

8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, dtype: object

In [20]:
# Expected width of the one-hot encoded matrix: the number of distinct
# categories in each column, summed over every column except the first ("class").

count = [data_raw[column].nunique() for column in columns[1:]]

sum(count)

117

In [10]:
# One-hot encode every column of X_raw.

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# fit_transform learns the categories of each column and encodes X_raw in one call.
X_data = encoder.fit_transform(X_raw)

In [11]:
# Display the encoded matrix summary (shape, dtype, storage format).
X_data

<8124x117 sparse matrix of type '<class 'numpy.float64'>'
	with 178728 stored elements in Compressed Sparse Row format>

In [25]:
# Binary target: poisonous ('p') -> 1, anything else -> 0.

y_data = (y_raw == 'p').astype('int64')
y_data.tail()

8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, dtype: int64

In [29]:
# Check the dtypes of the target and the encoded feature matrix before modeling.
y_data.dtype, X_data.dtype

(dtype('int64'), dtype('float64'))

### Modeling

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Entropy-based decision tree limited to depth 5.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split (even with the default "best" splitter), so an unseeded tree can
# differ between runs when several splits tie. 3 matches the seed used for the
# ShuffleSplit cross-validator in this notebook.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=3)

In [47]:
# Fit on the full dataset and predict on that same data.
# The value shown below is therefore a training (resubstitution) accuracy and
# is optimistic; it is not an estimate of generalisation performance.
model = tree.fit(X_data, y_data)
y_pred = model.predict(X_data)

# Fraction of matching labels. Series.mean() is used instead of np.mean()
# because `np` is not imported in any cell above, which raises NameError on a
# fresh kernel.
(y_data == y_pred).mean()

0.999507631708518

### Validation Performance

In [48]:
from sklearn.model_selection import ShuffleSplit

# Five random 80/20 train/test splits; the fixed seed makes the splits reproducible.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3)

# One accuracy score per split, kept under its own name so the array and the
# scalar mean are not stored in the same variable.
cv_scores = cross_val_score(tree, X_data, y_data, scoring='accuracy', cv=cv)

# ndarray.mean() is used instead of np.mean() because `np` is not imported in
# any cell above, which raises NameError on a fresh kernel.
result = cv_scores.mean()

# "%3f" sets a minimum field width of 3, not a precision, so the value is
# printed with the default six decimal places.
print("Validation Performance : %3f" % (result))

Validation Performance : 0.999508
