In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
from sklearn.model_selection import train_test_split
from lightgbm import plot_importance
import matplotlib.pyplot as plt

In [None]:
# Load the raw smoking dataset from the project's data directory (relative path).
smoking = pd.read_csv(filepath_or_buffer='../data/Smoking_raw/smoking.csv')

In [None]:
def labeling(smoking):
    """Return a cleaned, label-encoded copy of the raw smoking frame.

    Steps:
      * drop the ``ID`` column (row identifier, not a feature) when present;
      * drop the ``oral`` column (oral-exam flag; per the original author it is
        'Y' for every row) when present;
      * label-encode the string columns ``gender`` and ``tartar`` when present.

    The input frame is never modified, so the function can be called again on
    the same frame (e.g. when the notebook cell is re-run) or on its own output.

    Parameters
    ----------
    smoking : pandas.DataFrame
        Raw data as loaded from the CSV file.

    Returns
    -------
    pandas.DataFrame
        New frame without ``ID``/``oral`` and with ``gender``/``tartar``
        replaced by integer codes.
    """
    # drop() without inplace returns a new frame; errors='ignore' keeps the
    # call valid when a column has already been removed.
    df_scaling = smoking.drop(columns=['ID', 'oral'], errors='ignore')

    # Label-encode the categorical string features, one fresh encoder each.
    for column in ('gender', 'tartar'):
        if column in df_scaling.columns:
            df_scaling[column] = LabelEncoder().fit_transform(df_scaling[column])

    return df_scaling

In [None]:
# Clean and label-encode the raw frame; every later cell works from df_scaling.
df_scaling = labeling(smoking=smoking)

In [None]:
# Columns that already hold category codes; they bypass standardisation.
cate_features = df_scaling[['gender', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']]
# Remaining numeric features to standardise. The label column 'smoking' is
# removed as well: leaving it in would carry the target into the feature
# matrix that is later split into x_train / x_test (target leakage).
scaled_features = df_scaling.drop(columns=[*cate_features.columns, 'smoking'])

In [None]:
# Label vector: the 'smoking' column of the encoded frame.
target = df_scaling.loc[:, 'smoking']

In [None]:
scaler = StandardScaler()
# Fit AND apply the scaler. fit_transform returns a bare ndarray, so the
# column labels and the row index are restored explicitly; keeping the index
# is what lets the categorical columns below align row-for-row.
standard_scaled = pd.DataFrame(
    scaler.fit_transform(scaled_features),
    columns=scaled_features.columns,
    index=scaled_features.index,
)
# Re-attach the categorical columns unscaled.
standard_scaled[cate_features.columns] = cate_features

In [None]:
# Feature matrix handed to the train/test split (same object as standard_scaled).
scaled_data = standard_scaled
# Preview the first rows only instead of rendering the whole frame.
scaled_data.head()

In [None]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_data,
    target,
    test_size=0.1,
    random_state=0,
)

In [None]:
# scaled = scaler.transform(scaled_features)

In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgbm_wrapper = LGBMClassifier(n_estimators=400)

# Early stopping must monitor data that is NOT used for the final metrics:
# watching x_test would tune the number of boosting rounds on the very rows
# the model is scored on afterwards. Carve a validation fold out of the
# training data instead.
x_fit, x_valid, y_fit, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=0, stratify=y_train
)
evals = [(x_valid, y_valid)]

# Train with early stopping (patience: 100 rounds). Early stopping and
# per-iteration logging are passed as callbacks because recent LightGBM
# releases no longer accept `early_stopping_rounds=` / `verbose=` in fit().
lgbm_wrapper.fit(
    x_fit,
    y_fit,
    eval_metric='logloss',
    eval_set=evals,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=1)],
)

# Predict: hard class labels and the positive-class probability.
preds = lgbm_wrapper.predict(x_test)
pred_proba = lgbm_wrapper.predict_proba(x_test)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics of a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels. Required, although the signature keeps the
        ``None`` default for backward compatibility.
    pred_proba : array-like, optional
        Predicted scores/probabilities for the positive class. When omitted,
        ROC-AUC is skipped and left out of the printed summary.

    Raises
    ------
    ValueError
        If ``pred`` is not provided.
    """
    if pred is None:
        raise ValueError("pred (predicted labels) is required")

    # Compute every metric before printing so a failing metric produces no
    # partial output.
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores rather than hard labels, hence it is optional.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)
    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        # Append the AUC in the same format the summary line has always used.
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [None]:
# Report the hold-out metrics of the fitted LightGBM model.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
%matplotlib inline

# Feature-importance chart of the fitted model; 'split' ranks features by how
# often they are used in a split.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))
plot_importance(booster=lgbm_wrapper, importance_type='split', ax=ax)