In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import early_stopping, log_evaluation # training callbacks (early stopping / per-round logging)
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, StandardScaler #for standardization

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
smoking = pd.read_csv('../data/smoking.csv')

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
smoking.info()
# Categorical data: 'gender', 'smoking', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries'

In [None]:
# Preview the first five rows of the raw frame.
smoking.head(n=5)

In [None]:
# Count missing values per column (isna is the same method as isnull).
smoking.isna().sum()

In [None]:
# Working frame: the raw data without the ID column; display the result.
df = smoking.drop(columns=["ID"])
df

In [None]:
# The 'oral' feature (whether an oral exam was performed) is 'Y' for every row, so drop it.
df = df.drop(columns=["oral"])
df

In [None]:
# Replace the 'gender' column with integer codes.
# The encoder is fitted once; the earlier extra fit_transform call discarded its result.
labels = LabelEncoder()
df["gender"] = labels.fit_transform(df["gender"])

In [None]:
# Replace the 'tartar' column with integer codes.
# The encoder is fitted once; the earlier extra fit_transform call discarded its result.
# Note: this rebinds `labels`, so the encoder fitted on 'gender' is no longer reachable.
labels = LabelEncoder()
df["tartar"] = labels.fit_transform(df["tartar"])

In [None]:
# Display the working frame after 'gender' and 'tartar' were label-encoded.
df

In [None]:
# Categorical columns of the working frame (label encoding already applied).
cate_features = df.loc[:, [
    'gender',
    'smoking',
    'tartar',
    'hearing(right)',
    'hearing(left)',
    'dental caries',
]]

In [None]:
# List the column names of the working frame.
df.columns

In [None]:
# Correlation between basic physical attributes and smoking.
# .assign builds a new frame, so the label column is not written onto a slice
# of df (which triggers SettingWithCopyWarning on pandas < 3).
cat1 = df.loc[:, 'gender':'hearing(right)'].assign(smoking=df['smoking'])
cat1.corr()['smoking']

In [None]:
# Correlation between cardiovascular / respiratory features and smoking.
# .assign builds a new frame, so the label column is not written onto a slice
# of df (which triggers SettingWithCopyWarning on pandas < 3).
cat2 = df.loc[:, 'systolic':'hemoglobin'].assign(smoking=df['smoking'])
cat2.corr()['smoking']

In [None]:
# Correlation between renal (kidney) features and smoking.
# .assign builds a new frame, so the label column is not written onto a slice
# of df (which triggers SettingWithCopyWarning on pandas < 3).
cat3 = df.loc[:, 'Urine protein':'serum creatinine'].assign(smoking=df['smoking'])
cat3.corr()['smoking']

In [None]:
# Correlation between liver-panel features and smoking.
# .assign builds a new frame, so the label column is not written onto a slice
# of df (which triggers SettingWithCopyWarning on pandas < 3).
cat4 = df.loc[:, 'AST':'Gtp'].assign(smoking=df['smoking'])
cat4.corr()['smoking']

In [None]:
# Correlation between oral-health features and smoking.
# The slice runs from 'dental caries' to the last column of df; the lookup
# below relies on 'smoking' being inside that range.
cat5 = df.loc[:, 'dental caries':]
cat5.corr().loc[:, 'smoking']

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Target: the smoking label.
y = df["smoking"]
# Features: cat1 had 'smoking' appended for its correlation table, so remove it
# here. Leaving it in would feed the label to the model as an input (target
# leakage) and make every downstream metric meaningless.
x = cat1.drop(columns=["smoking"])

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    shuffle=True,
    random_state=1,
)

In [None]:
# Fit the min-max scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Carve a validation set out of the training data for early stopping, so the
# held-out test set is never used to choose the number of boosting rounds
# (doing so would bias the metrics computed on it afterwards).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.10,
                                              shuffle=True,
                                              random_state=1)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X_val, y_val)]

# Training with early stopping: stop once the validation log loss has not
# improved for 100 rounds. Early stopping and per-round logging are passed as
# callbacks because the `early_stopping_rounds=` / `verbose=` keyword arguments
# of fit() were removed in LightGBM 4.0.
lgbm_wrapper.fit(X_fit, y_fit,
                 eval_metric='logloss', eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=100),
                            log_evaluation(period=1)])

# Prediction: hard class labels and the probability of the positive class.
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required; the ``None`` default is kept
            only so existing keyword-style calls keep working.
        pred_proba: Scores or probabilities for the positive class. Optional;
            when omitted, ROC-AUC is skipped instead of raising.

    Raises:
        ValueError: If ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError("pred (predicted class labels) is required")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    # ROC-AUC needs scores rather than hard labels, so it is only reported when
    # pred_proba is given. It is computed before anything is printed so that a
    # failure here produces no partial output.
    if pred_proba is not None:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += ', AUC:{0:.4f}'.format(roc_auc)

    print('오차 행렬')
    print(confusion)
    print(summary)

In [None]:
# Report test-set metrics for the model fitted above.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
# Visualize feature importance using plot_importance()
from lightgbm import plot_importance
import matplotlib.pyplot as plt
%matplotlib inline

# importance_type='split' is passed explicitly; the plot is drawn on a tall 10x12 figure.
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(lgbm_wrapper, ax=ax,importance_type='split')

In [None]:
# List the columns held in cat1 (this includes the 'smoking' column appended for the correlation table).
cat1.columns

In [None]:
# Target: the smoking label.
y = df["smoking"]
# Features: cat2 had 'smoking' appended for its correlation table, so remove it
# here. Leaving it in would feed the label to the model as an input (target
# leakage) and make every downstream metric meaningless.
x = cat2.drop(columns=["smoking"])

In [None]:
# Hold out 10% of the rows as the final test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.10,
                                                    shuffle = True,
                                                    random_state = 1)
# Fit the min-max scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Carve a validation set out of the training data for early stopping, so the
# held-out test set is never used to choose the number of boosting rounds
# (doing so would bias the metrics computed on it afterwards).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.10,
                                              shuffle=True,
                                              random_state=1)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X_val, y_val)]

# Training with early stopping: stop once the validation log loss has not
# improved for 100 rounds. Early stopping and per-round logging are passed as
# callbacks because the `early_stopping_rounds=` / `verbose=` keyword arguments
# of fit() were removed in LightGBM 4.0.
lgbm_wrapper.fit(X_fit, y_fit,
                 eval_metric='logloss', eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=100),
                            log_evaluation(period=1)])

# Prediction: hard class labels and the probability of the positive class.
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

In [None]:
# Report test-set metrics for the model fitted above.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
# Plot split-count feature importances of the most recently fitted model.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))
plot_importance(booster=lgbm_wrapper, importance_type='split', ax=ax)

In [None]:
# Target: the smoking label.
y = df["smoking"]
# Features: cat3 had 'smoking' appended for its correlation table, so remove it
# here. Leaving it in would feed the label to the model as an input (target
# leakage) and make every downstream metric meaningless.
x = cat3.drop(columns=["smoking"])

In [None]:
# Hold out 10% of the rows as the final test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.10,
                                                    shuffle = True,
                                                    random_state = 1)
# Fit the min-max scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Carve a validation set out of the training data for early stopping, so the
# held-out test set is never used to choose the number of boosting rounds
# (doing so would bias the metrics computed on it afterwards).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.10,
                                              shuffle=True,
                                              random_state=1)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X_val, y_val)]

# Training with early stopping: stop once the validation log loss has not
# improved for 100 rounds. Early stopping and per-round logging are passed as
# callbacks because the `early_stopping_rounds=` / `verbose=` keyword arguments
# of fit() were removed in LightGBM 4.0.
lgbm_wrapper.fit(X_fit, y_fit,
                 eval_metric='logloss', eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=100),
                            log_evaluation(period=1)])

# Prediction: hard class labels and the probability of the positive class.
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

In [None]:
# Report test-set metrics for the model fitted above.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
# Plot split-count feature importances of the most recently fitted model.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))
plot_importance(booster=lgbm_wrapper, importance_type='split', ax=ax)

In [None]:
# Target: the smoking label.
y = df["smoking"]
# Features: cat4 had 'smoking' appended for its correlation table, so remove it
# here. Leaving it in would feed the label to the model as an input (target
# leakage) and make every downstream metric meaningless.
x = cat4.drop(columns=["smoking"])

In [None]:
# Hold out 10% of the rows as the final test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.10,
                                                    shuffle = True,
                                                    random_state = 1)
# Fit the min-max scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Carve a validation set out of the training data for early stopping, so the
# held-out test set is never used to choose the number of boosting rounds
# (doing so would bias the metrics computed on it afterwards).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.10,
                                              shuffle=True,
                                              random_state=1)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X_val, y_val)]

# Training with early stopping: stop once the validation log loss has not
# improved for 100 rounds. Early stopping and per-round logging are passed as
# callbacks because the `early_stopping_rounds=` / `verbose=` keyword arguments
# of fit() were removed in LightGBM 4.0.
lgbm_wrapper.fit(X_fit, y_fit,
                 eval_metric='logloss', eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=100),
                            log_evaluation(period=1)])

# Prediction: hard class labels and the probability of the positive class.
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

In [None]:
# Report test-set metrics for the model fitted above.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
# Plot split-count feature importances of the most recently fitted model.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))
plot_importance(booster=lgbm_wrapper, importance_type='split', ax=ax)

In [None]:
# Target: the smoking label.
y = df["smoking"]
# Features: cat5 is the column slice of df starting at 'dental caries', and that
# slice contains 'smoking' itself (the correlation cell indexes it). Remove it
# here; leaving it in would feed the label to the model as an input (target
# leakage) and make every downstream metric meaningless.
x = cat5.drop(columns=["smoking"])

In [None]:
# Hold out 10% of the rows as the final test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.10,
                                                    shuffle = True,
                                                    random_state = 1)
# Fit the min-max scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Carve a validation set out of the training data for early stopping, so the
# held-out test set is never used to choose the number of boosting rounds
# (doing so would bias the metrics computed on it afterwards).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=0.10,
                                              shuffle=True,
                                              random_state=1)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X_val, y_val)]

# Training with early stopping: stop once the validation log loss has not
# improved for 100 rounds. Early stopping and per-round logging are passed as
# callbacks because the `early_stopping_rounds=` / `verbose=` keyword arguments
# of fit() were removed in LightGBM 4.0.
lgbm_wrapper.fit(X_fit, y_fit,
                 eval_metric='logloss', eval_set=evals,
                 callbacks=[early_stopping(stopping_rounds=100),
                            log_evaluation(period=1)])

# Prediction: hard class labels and the probability of the positive class.
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

In [None]:
# Report test-set metrics for the model fitted above.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

In [None]:
# Plot split-count feature importances of the most recently fitted model.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))
plot_importance(booster=lgbm_wrapper, importance_type='split', ax=ax)