In [277]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import os
os.environ["OMP_NUM_THREADS"] = '1'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%config Completer.use_jedi = False
%config InlineBackend.figure_format = 'retina'

In [2]:
mush = pd.read_csv('mushrooms.csv')

In [3]:
mush.shape

(8124, 23)

In [4]:
mush.keys()

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [5]:
mush.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
df = pd.DataFrame(mush)

In [307]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,s,o,o,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,s,o,o,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,s,o,o,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,s,k,w,w,w,o,e,w,v,l


In [8]:
df = df.drop('veil-type', axis=1)

In [9]:
df.columns[21]

'habitat'

In [308]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [10]:
le = LabelEncoder()
new_cat = pd.DataFrame()
for i in range(0, 22):
    le.fit(df.iloc[:, i])
    le_encoded = le.transform(df.iloc[:, i])
    new_cat[df.columns[i]] = le_encoded

In [11]:
new_cat

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,2,7,7,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,2,7,7,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,2,7,7,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,2,7,7,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,2,7,7,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,2,5,5,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,2,5,5,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,2,5,5,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,2,1,7,7,2,1,0,7,4,2


In [65]:
a = np.array(new_cat)

In [69]:
X = a[:, 1:]

In [70]:
y = a[:, 0]

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [205]:
X_train.shape

(6499, 21)

In [206]:
X_test.shape

(1625, 21)

In [207]:
y_train.shape

(6499,)

In [287]:
gbm = LGBMClassifier(max_depth=-1, application='binary', boosting_type='gbdt')
gbm.fit(X_train, y_train)

In [289]:
pred = gbm.predict(X_test)
print(accuracy_score(pred, y_test))
print(f1_score(pred, y_test))

1.0
1.0


In [300]:
gbm.feature_importances_/10  # [8], [4], [18]

array([12.4, 11.8, 20.9,  3.7, 40.8,  7. ,  9.9, 15.3, 53.7,  2.8, 17.3,
        0.4, 17.7,  4. ,  1.9,  0.1,  2.9,  3.7, 31.2, 16.6, 20.8])

In [247]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=10)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.3767471	total: 3.25ms	remaining: 322ms
1:	learn: 0.3038138	total: 6.41ms	remaining: 314ms
2:	learn: 0.1869815	total: 8.86ms	remaining: 286ms
3:	learn: 0.1277612	total: 11.5ms	remaining: 276ms
4:	learn: 0.0904214	total: 13.8ms	remaining: 262ms
5:	learn: 0.0659019	total: 16.1ms	remaining: 252ms
6:	learn: 0.0591043	total: 18.2ms	remaining: 242ms
7:	learn: 0.0425306	total: 20.4ms	remaining: 235ms
8:	learn: 0.0348307	total: 22.9ms	remaining: 231ms
9:	learn: 0.0321197	total: 25.7ms	remaining: 232ms
10:	learn: 0.0317485	total: 27.5ms	remaining: 222ms
11:	learn: 0.0283975	total: 29.8ms	remaining: 219ms
12:	learn: 0.0273432	total: 32ms	remaining: 214ms
13:	learn: 0.0255046	total: 34.2ms	remaining: 210ms
14:	learn: 0.0214671	total: 36.8ms	remaining: 208ms
15:	learn: 0.0185867	total: 39.5ms	remaining: 207ms
16:	learn: 0.0176213	total: 41.6ms	remaining: 203ms
17:	learn: 0.0164684	total: 43.8ms	remaining: 199ms
18:	learn: 0.0146061	total: 45.9ms	remaining: 

In [229]:
cat_pred = clf_cat.predict(X_test)

In [230]:
train_pred = clf_cat.predict(X_train)

In [231]:
b = pd.DataFrame(cat_pred)
c = pd.DataFrame(y_test)

In [232]:
b.value_counts()

0    845
1    780
dtype: int64

In [233]:
c.value_counts()

0    845
1    780
dtype: int64

In [234]:
score = accuracy_score(y_test, cat_pred)
print(score)

1.0


In [235]:
train_score = accuracy_score(y_train, train_pred)
print(train_score)

1.0


In [236]:
f1 = f1_score(y_test, cat_pred)
print(f1)

1.0


In [237]:
f1_train = f1_score(y_train, train_pred)
print(f1_train)

1.0


In [301]:
print(f'{clf_cat.feature_importances_}')  # [7], [4], [10]

[ 0.54580421  4.04570181  0.96774168  3.75218123 12.91022477  0.05825597
  5.93623535 15.26892759  1.04873334  7.2106307  11.55055153  8.98459037
  4.13290232  4.93882161  0.37246438  0.23646017  0.79457511  2.89044579
  9.64947593  1.35661036  3.34866576]


In [340]:
X_train, X_test, y_train, y_test = train_test_split(new_cat[['gill-color', 'odor', 'stalk-root']], y, test_size=0.2)

In [341]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [342]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=10)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.5680344	total: 1.83ms	remaining: 181ms
1:	learn: 0.4675894	total: 4.08ms	remaining: 200ms
2:	learn: 0.3942756	total: 5.59ms	remaining: 181ms
3:	learn: 0.3604342	total: 7.05ms	remaining: 169ms
4:	learn: 0.2999586	total: 8.76ms	remaining: 167ms
5:	learn: 0.2557031	total: 10ms	remaining: 157ms
6:	learn: 0.2286213	total: 11.5ms	remaining: 152ms
7:	learn: 0.2043221	total: 13.3ms	remaining: 153ms
8:	learn: 0.1922642	total: 15.1ms	remaining: 152ms
9:	learn: 0.1728460	total: 17.5ms	remaining: 157ms
10:	learn: 0.1545532	total: 19.8ms	remaining: 160ms
11:	learn: 0.1373235	total: 21.8ms	remaining: 160ms
12:	learn: 0.1329662	total: 23.7ms	remaining: 159ms
13:	learn: 0.1193569	total: 25.7ms	remaining: 158ms
14:	learn: 0.1121772	total: 27.9ms	remaining: 158ms
15:	learn: 0.1028023	total: 30ms	remaining: 158ms
16:	learn: 0.0984454	total: 32.7ms	remaining: 160ms
17:	learn: 0.0924447	total: 34.6ms	remaining: 158ms
18:	learn: 0.0901721	total: 35.9ms	remaining: 15

In [343]:
cat_pred = clf_cat.predict(X_test)
b = pd.DataFrame(cat_pred)
c = pd.DataFrame(y_test)

In [344]:
score = accuracy_score(y_test, cat_pred)
print(score)
f1 = f1_score(y_test, cat_pred)
print(f1)

0.9907692307692307
0.9904882688649335


In [345]:
train_pred = clf_cat.predict(X_train)

In [346]:
train_score = accuracy_score(y_train, train_pred)
print(train_score)
f1_train = f1_score(y_train, train_pred)
print(f1_train)

0.9924603785197723
0.9920852850912615
