In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from ngboost import NGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

import os
os.environ["OMP_NUM_THREADS"] = '1'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%config Completer.use_jedi = False
%config InlineBackend.figure_format = 'retina'

In [2]:
mush = pd.read_csv('mushrooms.csv')

In [3]:
mush.shape

(8124, 23)

In [4]:
mush.keys()

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [5]:
mush.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
df = pd.DataFrame(mush)

In [7]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [8]:
df = df.drop('veil-type', axis=1)

In [9]:
df.columns[21]

'habitat'

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [11]:
le = LabelEncoder()
new_cat = pd.DataFrame()
for i in range(0, 22):
    le.fit(df.iloc[:, i])
    le_encoded = le.transform(df.iloc[:, i])
    new_cat[df.columns[i]] = le_encoded

In [12]:
new_cat

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,2,7,7,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,2,7,7,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,2,7,7,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,2,7,7,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,2,7,7,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,2,5,5,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,2,5,5,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,2,5,5,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,2,1,7,7,2,1,0,7,4,2


In [13]:
a = np.array(new_cat)

In [14]:
X = a[:, 1:]

In [15]:
y = a[:, 0]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
X_train.shape

(6499, 21)

In [18]:
X_test.shape

(1625, 21)

In [19]:
y_train.shape

(6499,)

In [20]:
gbm = LGBMClassifier(max_depth=-1, application='binary', boosting_type='gbdt')
gbm.fit(X_train, y_train)

In [21]:
pred = gbm.predict(X_test)
print(accuracy_score(pred, y_test))
print(f1_score(pred, y_test))

1.0
1.0


In [22]:
gbm.feature_importances_/10  # [8], [4], [18]

array([19.2, 12.1, 27.1, 11.8, 26.9,  7.1,  7.4, 11.9, 49.4,  2.5, 17.3,
        8.5, 12. ,  3.2,  1.7,  0. ,  2.7,  0.9, 31.5, 15.7, 25.9])

In [23]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=10)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.3760814	total: 175ms	remaining: 17.3s
1:	learn: 0.3025494	total: 179ms	remaining: 8.78s
2:	learn: 0.1862482	total: 182ms	remaining: 5.89s
3:	learn: 0.1286758	total: 184ms	remaining: 4.43s
4:	learn: 0.1052005	total: 187ms	remaining: 3.55s
5:	learn: 0.0828761	total: 189ms	remaining: 2.97s
6:	learn: 0.0680539	total: 192ms	remaining: 2.55s
7:	learn: 0.0590747	total: 194ms	remaining: 2.24s
8:	learn: 0.0530282	total: 197ms	remaining: 1.99s
9:	learn: 0.0485457	total: 199ms	remaining: 1.79s
10:	learn: 0.0433108	total: 201ms	remaining: 1.63s
11:	learn: 0.0361175	total: 207ms	remaining: 1.52s
12:	learn: 0.0317265	total: 211ms	remaining: 1.42s
13:	learn: 0.0299222	total: 215ms	remaining: 1.32s
14:	learn: 0.0255103	total: 220ms	remaining: 1.24s
15:	learn: 0.0213505	total: 224ms	remaining: 1.18s
16:	learn: 0.0174879	total: 228ms	remaining: 1.11s
17:	learn: 0.0127456	total: 230ms	remaining: 1.05s
18:	learn: 0.0098403	total: 233ms	remaining: 992ms
19:	learn: 

In [24]:
cat_pred = clf_cat.predict(X_test)

In [25]:
train_pred = clf_cat.predict(X_train)

In [26]:
b = pd.DataFrame(cat_pred)
c = pd.DataFrame(y_test)

In [27]:
b.value_counts()

1    827
0    798
dtype: int64

In [28]:
c.value_counts()

1    827
0    798
dtype: int64

In [29]:
score = accuracy_score(y_test, cat_pred)
print(score)

1.0


In [30]:
train_score = accuracy_score(y_train, train_pred)
print(train_score)

1.0


In [31]:
f1 = f1_score(y_test, cat_pred)
print(f1)

1.0


In [32]:
f1_train = f1_score(y_train, train_pred)
print(f1_train)

1.0


In [33]:
print(f'{clf_cat.feature_importances_}')  # [7], [4], [10]

[ 1.43699296  3.95121241  2.10566738  1.58395367 13.77307987  0.35716425
  5.99078629 17.84289913  1.75704639  5.0753892   4.4785752   9.54332072
  5.53779042  5.53344657  0.95761304  0.06246765  1.49572685  3.85845697
 11.09231036  0.98408397  2.58201671]


In [34]:
X_train, X_test, y_train, y_test = train_test_split(new_cat[['gill-color', 'odor']], y, test_size=0.2)

In [35]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [36]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=10)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.5548127	total: 1.7ms	remaining: 169ms
1:	learn: 0.4484401	total: 3.34ms	remaining: 164ms
2:	learn: 0.3906095	total: 4.32ms	remaining: 140ms
3:	learn: 0.3349759	total: 5.34ms	remaining: 128ms
4:	learn: 0.2791185	total: 6.34ms	remaining: 120ms
5:	learn: 0.2437818	total: 7.23ms	remaining: 113ms
6:	learn: 0.2204822	total: 8.07ms	remaining: 107ms
7:	learn: 0.1946382	total: 10ms	remaining: 115ms
8:	learn: 0.1709136	total: 11.2ms	remaining: 114ms
9:	learn: 0.1583990	total: 12.1ms	remaining: 109ms
10:	learn: 0.1444800	total: 13.1ms	remaining: 106ms
11:	learn: 0.1359373	total: 14.2ms	remaining: 104ms
12:	learn: 0.1337331	total: 15ms	remaining: 100ms
13:	learn: 0.1255769	total: 17.2ms	remaining: 105ms
14:	learn: 0.1152406	total: 18.1ms	remaining: 103ms
15:	learn: 0.1095500	total: 19.1ms	remaining: 100ms
16:	learn: 0.1072203	total: 20ms	remaining: 97.7ms
17:	learn: 0.0994074	total: 21.2ms	remaining: 96.7ms
18:	learn: 0.0990935	total: 21.9ms	remaining: 93.

In [37]:
train_pred = clf_cat.predict(X_train)

In [38]:
cat_pred = clf_cat.predict(X_test)
b = pd.DataFrame(cat_pred)
c = pd.DataFrame(y_test)

In [39]:
train_score = accuracy_score(y_train, train_pred)
print(train_score)
f1_train = f1_score(y_train, train_pred)
print(f1_train)

0.988767502692722
0.9882277052088373


In [40]:
score = accuracy_score(y_test, cat_pred)
print(score)
f1 = f1_score(y_test, cat_pred)
print(f1)

0.9858461538461538
0.9850162866449511


In [41]:
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [42]:
def get_clf_eval(y_test, pred=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도 : {0:.3f}, 정밀도 : {1:.3f}, 재현율 : {2:.3f}, F1 : {3:.3f},\
    '.format(accuracy, precision, recall, f1))

In [43]:
get_clf_eval(y_test, cat_pred)

오차 행렬
[[846   0]
 [ 23 756]]
정확도 : 0.986, 정밀도 : 1.000, 재현율 : 0.970, F1 : 0.985,    


In [44]:
ngb = NGBClassifier().fit(X_train, y_train)
y_preds = ngb.predict(X_test)

# test Mean Squared Error
test_MSE = mean_squared_error(y_preds, y_test)
print('Test MSE', test_MSE)

[iter 0] loss=0.6925 val_loss=0.0000 scale=8.0000 norm=16.0000
[iter 100] loss=0.0911 val_loss=0.0000 scale=2.0000 norm=2.4692
[iter 200] loss=0.0802 val_loss=0.0000 scale=0.5000 norm=0.6166
[iter 300] loss=0.0730 val_loss=0.0000 scale=0.5000 norm=0.6264
[iter 400] loss=0.0714 val_loss=0.0000 scale=0.2500 norm=0.3166
Test MSE 0.024


In [45]:
get_clf_eval(y_test, y_preds)

오차 행렬
[[846   0]
 [ 39 740]]
정확도 : 0.976, 정밀도 : 1.000, 재현율 : 0.950, F1 : 0.974,    
