In [9]:
from azureml.core import Workspace, Dataset

# from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


import pandas as pd
pd.set_option('max.columns', 25, 'max.rows', 150)

In [16]:
df1 = pd.read_csv('https://raw.githubusercontent.com/sannif/udacity_capstone_project/main/dataset/mushrooms.csv')

In [17]:
df1.apply(lambda x: x.nunique())

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [19]:
df1['gill-attachment'].value_counts()

f    7914
a     210
Name: gill-attachment, dtype: int64

In [6]:
def preprocess(df):
    # Drop columns with a single value
    df_clean = df.drop(['gill-attachment', 'veil-type'], axis=1)

    df_clean['bruises'] = df_clean['bruises'].replace({'t': 1 , 'f': 0})
    # df_clean['bruises'] = df_clean['bruises'].replace({True: 1 , False: 0})
    df_clean['gill-spacing'] = df_clean['gill-spacing'].replace({'c' : 1, 'w' : 0})
    df_clean['gill-size'] = df_clean['gill-size'].replace({'n' : 1, 'b' : 0})
    df_clean['stalk-shape'] = df_clean['stalk-shape'].replace({'e' : 1, 't' : 0})

    # df_clean['class'] = df_clean['class'].replace({"p": 1 , "e": 0})
    
    # one-hot encoding
    dummy_cols = ['cap-shape', 'cap-surface', 'cap-color', 'odor', 'gill-color', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
              'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
    df_clean = pd.get_dummies(df_clean, columns=dummy_cols)
    return df_clean

In [7]:
df_clean = preprocess(df)

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
model = RandomForestClassifier()

In [10]:
y = df_clean.pop('class')
x = df_clean

In [11]:
model.fit(x, y)

RandomForestClassifier()

In [17]:
model.feature_names = list(x.columns.values)

In [16]:
model.get_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [10]:
ws = Workspace.from_config()

In [11]:
dataset = Dataset.get_by_name(ws, name='Mushrooms')
df = dataset.to_pandas_dataframe()

In [12]:
df.shape

(8124, 23)

In [6]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,True,p,False,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,True,a,False,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,True,l,False,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,True,p,False,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,False,n,False,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [13]:
for i in df.columns:
    print(i)
    print(df[i].value_counts(), '\n')

class
e    4208
p    3916
Name: class, dtype: int64 

cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64 

cap-surface
y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64 

cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: cap-color, dtype: int64 

bruises
False    4748
True     3376
Name: bruises, dtype: int64 

odor
n    3528
f    2160
y     576
s     576
l     400
a     400
p     256
c     192
m      36
Name: odor, dtype: int64 

gill-attachment
False    7914
Name: gill-attachment, dtype: int64 

gill-spacing
c    6812
w    1312
Name: gill-spacing, dtype: int64 

gill-size
b    5612
n    2512
Name: gill-size, dtype: int64 

gill-color
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: gill-color, dtype: int64 

stalk-shape
t    4608
e    3516
Name: stalk-shape, dtype: int64 



In [14]:
counts = df.apply(lambda x: x.nunique())
counts

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              1
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [None]:
>

In [19]:
counts[counts == 2].index

Index(['class', 'bruises', 'gill-spacing', 'gill-size', 'stalk-shape'], dtype='object')

In [55]:
# drop columns with a single unique value
df_clean = df.drop(['gill-attachment', 'veil-type'], axis=1)

df_clean['bruises'] = df_clean['bruises'].replace({True: 1 , False: 0})
df_clean['class'] = df_clean['class'].replace({"p": 1 , "e": 0})
# df_clean['gill-spacing'] = df_clean['gill-spacing'].replace({'c' : 1, 'w' : 0})
# df_clean['gill-size'] = df_clean['gill-size'].replace({'n' : 1, 'b' : 0})
# df_clean['stalk-shape'] = df_clean['stalk-shape'].replace({'e' : 1, 't' : 0})

# # one-hot encoding
# dummy_cols = ['cap-shape', 'cap-surface', 'cap-color', 'odor', 'gill-color', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
#               'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# df_clean = pd.get_dummies(df_clean, columns=dummy_cols)

In [57]:
y = df_clean.pop('class')
X = df_clean

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=1234)

In [58]:
model = CatBoostClassifier(loss_function='CrossEntropy', eval_metric='Accuracy', iterations=100)

In [60]:
model.fit(X=X_train, y=y_train, cat_features=list(range(X.shape[1])))

0:	learn: 0.9860496	total: 6.54ms	remaining: 647ms
1:	learn: 0.9860496	total: 10.8ms	remaining: 530ms
2:	learn: 0.9860496	total: 15.4ms	remaining: 497ms
3:	learn: 0.9860496	total: 19.8ms	remaining: 476ms
4:	learn: 0.9860496	total: 23.3ms	remaining: 443ms
5:	learn: 0.9860496	total: 27.8ms	remaining: 436ms
6:	learn: 0.9860496	total: 31.8ms	remaining: 423ms
7:	learn: 0.9860496	total: 35.3ms	remaining: 406ms
8:	learn: 0.9860496	total: 39.6ms	remaining: 400ms
9:	learn: 0.9860496	total: 44.5ms	remaining: 401ms
10:	learn: 0.9860496	total: 49.5ms	remaining: 401ms
11:	learn: 0.9860496	total: 54.4ms	remaining: 399ms
12:	learn: 0.9860496	total: 59.6ms	remaining: 399ms
13:	learn: 0.9863778	total: 64.3ms	remaining: 395ms
14:	learn: 0.9860496	total: 67.7ms	remaining: 384ms
15:	learn: 0.9860496	total: 70.9ms	remaining: 372ms
16:	learn: 0.9860496	total: 74.7ms	remaining: 365ms
17:	learn: 0.9860496	total: 79ms	remaining: 360ms
18:	learn: 0.9863778	total: 84.7ms	remaining: 361ms
19:	learn: 0.9863778	tot

<catboost.core.CatBoostClassifier at 0x7f3f180d9b20>

In [61]:
y_pred = model.predict(X_test)

In [62]:
confusion_matrix(y_test, y_pred)

array([[1017,    0],
       [   2, 1012]])

In [64]:
model.score(X_test, y_test)

0.999015263417036