# 필요한 라이브러리 로딩

In [3]:
import numpy as np
import pandas as pd
from ngboost import NGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import recall_score, confusion_matrix, mean_squared_error
import warnings

In [4]:
warnings.simplefilter(action='ignore', category=FutureWarning)
%config Completer.use_jedi = False  # 자동완성 속도 증가
%config InlineBackend.figure_format = 'retina'

# 데이터 불러오기

In [5]:
mush = pd.read_csv('mushrooms.csv')
mush.shape  # 행, 열갯수 확인

(8124, 23)

# 데이터 전처리

In [6]:
mush.info()  # 결측치 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [7]:
df = pd.DataFrame(mush)  # 데이터 프레임으로 변경 후 출력
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [9]:
for i in mush.keys():
    val = mush[i].value_counts()
    print('\n{}가 가지고 있는 값의 종류 :  \n\n{}'.format(i, val))


class가 가지고 있는 값의 종류 :  

e    4208
p    3916
Name: class, dtype: int64

cap-shape가 가지고 있는 값의 종류 :  

x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64

cap-surface가 가지고 있는 값의 종류 :  

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

cap-color가 가지고 있는 값의 종류 :  

n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: cap-color, dtype: int64

bruises가 가지고 있는 값의 종류 :  

f    4748
t    3376
Name: bruises, dtype: int64

odor가 가지고 있는 값의 종류 :  

n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: odor, dtype: int64

gill-attachment가 가지고 있는 값의 종류 :  

f    7914
a     210
Name: gill-attachment, dtype: int64

gill-spacing가 가지고 있는 값의 종류 :  

c    6812
w    1312
Name: gill-spacing, dtype: int64

gill-size가 가지고 있는 값의 종류 :  

b    5612
n    2512
Name: gill-size, dtype: int64

gill-color가 가지고 있는 값의 종류 :  

b    1728
p    1492
w    1202
n    1048

In [10]:
len(df.columns)

23

In [11]:
# 다 같은 값이 들어있는 컬럼 삭제
df = df.drop('veil-type', axis=1)

In [12]:
len(df.columns)

22

In [13]:
# 라벨 인코딩
le = LabelEncoder()
new_cat = pd.DataFrame()
for i in range(0, 22):
    le.fit(df.iloc[:, i])
    le_encoded = le.transform(df.iloc[:, i])
    new_cat[df.columns[i]] = le_encoded

In [14]:
new_cat  # 라벨 인코딩 적용되었는지 확인

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,2,7,7,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,2,7,7,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,2,7,7,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,2,7,7,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,2,7,7,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,2,5,5,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,2,5,5,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,2,5,5,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,2,1,7,7,2,1,0,7,4,2


In [15]:
X = new_cat.iloc[:, 1:]  # 독립변수 추출
y = new_cat.iloc[:, 0]  # 타겟값 추출

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
X_train.shape

(6499, 21)

In [18]:
y_train.shape

(6499,)

# 모델학습 후 평가

## LGBMClassifier

### 모델 학습

In [19]:
# gbdt = gradient boosting decision tree
gbm = LGBMClassifier(max_depth=-1, application='binary', boosting_type='gbdt')
gbm.fit(X_train, y_train)

### 모델 평가

In [20]:
pred = gbm.predict(X_test)
print(accuracy_score(pred, y_test))
print(f1_score(pred, y_test))

1.0
1.0


## CatBoostClassifier

### 모델 학습

In [21]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=1)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.3082760	total: 175ms	remaining: 17.3s
1:	learn: 0.1223287	total: 177ms	remaining: 8.69s
2:	learn: 0.0740286	total: 180ms	remaining: 5.82s
3:	learn: 0.0503521	total: 183ms	remaining: 4.39s
4:	learn: 0.0356191	total: 185ms	remaining: 3.52s
5:	learn: 0.0288164	total: 188ms	remaining: 2.94s
6:	learn: 0.0165908	total: 190ms	remaining: 2.53s
7:	learn: 0.0121275	total: 193ms	remaining: 2.22s
8:	learn: 0.0102235	total: 195ms	remaining: 1.97s
9:	learn: 0.0077852	total: 198ms	remaining: 1.78s
10:	learn: 0.0069568	total: 200ms	remaining: 1.62s
11:	learn: 0.0052860	total: 203ms	remaining: 1.49s
12:	learn: 0.0047260	total: 206ms	remaining: 1.38s
13:	learn: 0.0041807	total: 209ms	remaining: 1.28s
14:	learn: 0.0037511	total: 211ms	remaining: 1.2s
15:	learn: 0.0034086	total: 214ms	remaining: 1.12s
16:	learn: 0.0027974	total: 216ms	remaining: 1.05s
17:	learn: 0.0024028	total: 219ms	remaining: 997ms
18:	learn: 0.0022011	total: 222ms	remaining: 945ms
19:	learn: 0

In [22]:
cat_pred = clf_cat.predict(X_test)
train_pred = clf_cat.predict(X_train)

In [23]:
train_score = accuracy_score(y_train, train_pred)
score = accuracy_score(y_test, cat_pred)
print(train_score)
print(score)

1.0
1.0


In [24]:
f1_train = f1_score(y_train, train_pred)
f1 = f1_score(y_test, cat_pred)
print(f1_train)
print(f1)

1.0
1.0


### 피쳐 중요도 확인
- 정확도가 너무 높게 나와서 문제가 있는지 확인하기 위해서 중요도가 높은 열만 추출하기 위해서 중요도를 확인

In [25]:
print(f'{clf_cat.feature_importances_.round(1) * 10}')
# 5, 8, 11, 19 번째 피쳐가 중요도가 비교적 높음

[  1.   0.   2.  44. 562.   0.  28.  54.  16.  21.  55.  27.  21.   3.
   4.   1.  19.   9.  71.  22.  41.]


In [26]:
new_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   class                     8124 non-null   int32
 1   cap-shape                 8124 non-null   int32
 2   cap-surface               8124 non-null   int32
 3   cap-color                 8124 non-null   int32
 4   bruises                   8124 non-null   int32
 5   odor                      8124 non-null   int32
 6   gill-attachment           8124 non-null   int32
 7   gill-spacing              8124 non-null   int32
 8   gill-size                 8124 non-null   int32
 9   gill-color                8124 non-null   int32
 10  stalk-shape               8124 non-null   int32
 11  stalk-root                8124 non-null   int32
 12  stalk-surface-above-ring  8124 non-null   int32
 13  stalk-surface-below-ring  8124 non-null   int32
 14  stalk-color-above-ring    8124 non-null 

### 모델 재학습
- 중요도가 비교적 높은 'odor', 'spore-print-color' 열만 학습

#### 데이터 재분리

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    new_cat[['odor']], y, test_size=0.2)

#### 모델 학습

In [28]:
clf_cat = CatBoostClassifier(iterations=100, random_strength=1)
fit_model = clf_cat.fit(X_train, y_train)
print(fit_model.get_params())

Learning rate set to 0.189243
0:	learn: 0.5356992	total: 722us	remaining: 71.5ms
1:	learn: 0.4279840	total: 1.49ms	remaining: 72.9ms
2:	learn: 0.3497214	total: 2.1ms	remaining: 67.9ms
3:	learn: 0.2913816	total: 2.78ms	remaining: 66.7ms
4:	learn: 0.2467866	total: 3.38ms	remaining: 64.2ms
5:	learn: 0.2121320	total: 3.92ms	remaining: 61.4ms
6:	learn: 0.1851932	total: 4.47ms	remaining: 59.4ms
7:	learn: 0.1605283	total: 5.16ms	remaining: 59.4ms
8:	learn: 0.1438582	total: 5.72ms	remaining: 57.9ms
9:	learn: 0.1281380	total: 6.42ms	remaining: 57.8ms
10:	learn: 0.1178356	total: 7.11ms	remaining: 57.5ms
11:	learn: 0.1076404	total: 7.97ms	remaining: 58.4ms
12:	learn: 0.0995505	total: 8.73ms	remaining: 58.4ms
13:	learn: 0.0932090	total: 9.52ms	remaining: 58.5ms
14:	learn: 0.0881051	total: 10.4ms	remaining: 58.7ms
15:	learn: 0.0841123	total: 11.3ms	remaining: 59.3ms
16:	learn: 0.0813955	total: 12.2ms	remaining: 59.3ms
17:	learn: 0.0798676	total: 13.1ms	remaining: 59.5ms
18:	learn: 0.0781135	total: 

In [29]:
# 정확도, 정밀도, 재현율, F1_score를 보여주는 함수
def get_clf_eval(y_test, pred=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도 : {0:.3f}, 정밀도 : {1:.3f}, 재현율 : {2:.3f}, F1 : {3:.3f},\
    '.format(accuracy, precision, recall, f1))

In [30]:
train_pred = clf_cat.predict(X_train)
cat_pred = clf_cat.predict(X_test)
test_MSE = mean_squared_error(cat_pred, y_test)
print('Test MSE', test_MSE)

Test MSE 0.014153846153846154


In [31]:
print('훈련 데이터 정확도 확인')
get_clf_eval(y_train, train_pred)
print('테스트 데이터 정확도 확인')
get_clf_eval(y_test, cat_pred)

훈련 데이터 정확도 확인
오차 행렬
[[3364    0]
 [  97 3038]]
정확도 : 0.985, 정밀도 : 1.000, 재현율 : 0.969, F1 : 0.984,    
테스트 데이터 정확도 확인
오차 행렬
[[844   0]
 [ 23 758]]
정확도 : 0.986, 정밀도 : 1.000, 재현율 : 0.971, F1 : 0.985,    


In [32]:
clf_cat.predict_proba(X_test)

array([[5.56403134e-04, 9.99443597e-01],
       [1.75768638e-04, 9.99824231e-01],
       [9.65904880e-01, 3.40951199e-02],
       ...,
       [9.65904880e-01, 3.40951199e-02],
       [9.88600965e-04, 9.99011399e-01],
       [3.06487735e-04, 9.99693512e-01]])

#### 과대적합 없음 확인

### NGBoost

#### 모델 학습 후 MSE 확인

In [33]:
ngb = NGBClassifier().fit(X_train, y_train)
y_pred = ngb.predict(X_test)

# test Mean Squared Error
test_MSE = mean_squared_error(y_pred, y_test)
print('Test MSE', test_MSE)

[iter 0] loss=0.6925 val_loss=0.0000 scale=16.0000 norm=32.0000
[iter 100] loss=0.0783 val_loss=0.0000 scale=4.0000 norm=5.6292
[iter 200] loss=0.0770 val_loss=0.0000 scale=1.0000 norm=1.4533
[iter 300] loss=0.0769 val_loss=0.0000 scale=1.0000 norm=1.4712
[iter 400] loss=0.0768 val_loss=0.0000 scale=1.0000 norm=1.4782
Test MSE 0.018461538461538463


#### 모델 평가

In [34]:
print('훈련 데이터 정확도 확인')
get_clf_eval(y_train, train_pred)
print('테스트 데이터 정확도 확인')
get_clf_eval(y_test, y_pred)

훈련 데이터 정확도 확인
오차 행렬
[[3364    0]
 [  97 3038]]
정확도 : 0.985, 정밀도 : 1.000, 재현율 : 0.969, F1 : 0.984,    
테스트 데이터 정확도 확인
오차 행렬
[[844   0]
 [ 30 751]]
정확도 : 0.982, 정밀도 : 1.000, 재현율 : 0.962, F1 : 0.980,    


### XGBoost

#### 모델 학습 후 MSE 확인

In [35]:
xgb = XGBClassifier().fit(X_train, y_train)
xg_pred = xgb.predict(X_test)

test_MSE = mean_squared_error(xg_pred, y_test)
print('Test MSE', test_MSE)

Test MSE 0.014153846153846154


#### 모델 평가

In [36]:
print('훈련 데이터 정확도 확인')
get_clf_eval(y_train, train_pred)
print('테스트 데이터 정확도 확인')
get_clf_eval(y_test, xg_pred)

훈련 데이터 정확도 확인
오차 행렬
[[3364    0]
 [  97 3038]]
정확도 : 0.985, 정밀도 : 1.000, 재현율 : 0.969, F1 : 0.984,    
테스트 데이터 정확도 확인
오차 행렬
[[844   0]
 [ 23 758]]
정확도 : 0.986, 정밀도 : 1.000, 재현율 : 0.971, F1 : 0.985,    
