In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw smoking dataset; the path is relative to the notebook's working directory.
smoking = pd.read_csv('../data/Smoking_raw/smoking.csv')
# Preview the first rows only instead of rendering the whole table.
smoking.head()

In [None]:
def preprocessing(df_scaling):
    """Clean the raw table and return it with the categorical columns first.

    Steps:
      1. Drop the ``ID`` column when it is present.
      2. If only one column is left, return the frame unchanged.
      3. Drop ``oral`` (per the original author's note, every row holds 'Y').
      4. Label-encode ``gender`` and ``tartar``.
      5. Reorder the columns: the five categorical columns first, then the rest.

    The caller's frame is never modified.

    Args:
        df_scaling: Raw DataFrame. Apart from the single-column shortcut, it is
            expected to contain ``oral``, ``gender``, ``tartar``,
            ``hearing(right)``, ``hearing(left)`` and ``dental caries``.

    Returns:
        A new DataFrame holding the cleaned, reordered columns.
    """
    if 'ID' in df_scaling:
        # drop() already returns a new frame, so the input stays untouched.
        df_scaling = df_scaling.drop("ID", axis=1)
    else:
        # No ID column: copy so the in-place encoding below cannot leak
        # back into the caller's frame.
        df_scaling = df_scaling.copy()

    # Nothing to encode or reorder when a single column remains.
    if len(df_scaling.columns) == 1:
        return df_scaling

    # 'oral' carries no information (reported as all 'Y'), so remove it.
    df_scaling = df_scaling.drop('oral', axis=1)

    # Label-encode the string-valued categorical features, one encoder each.
    for column in ('gender', 'tartar'):
        df_scaling[column] = LabelEncoder().fit_transform(df_scaling[column])

    # Split into categorical columns and the remaining columns, then put the
    # categorical block first.
    cate_columns = ['gender', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']
    cate_features = df_scaling[cate_columns]
    scaled_features = df_scaling.drop(cate_columns, axis=1)

    return pd.concat([cate_features, scaled_features], axis=1)

In [None]:
# Run the cleaning / encoding step on the raw table.
df = preprocessing(smoking)
# Preview the first rows only instead of rendering the whole table.
df.head()

In [None]:
# Print pandas' summary of the preprocessed frame.
df.info()

In [None]:
# Label vector: the 'smoking' column of the preprocessed frame.
target = df['smoking']

In [None]:
# The feature matrix must not contain the label itself; leaving 'smoking' in
# lets the model read the answer and makes every later score meaningless.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='smoking'),
    target,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted transformation to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
import time
# Imported here because the metrics cell further down has not run yet when
# the notebook is executed top to bottom.
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Baseline: scikit-learn style XGBoost classifier, timed end to end.
start_time = time.time()
xgb_model = XGBClassifier(n_estimators = 2000)
xgb_model.fit(x_train, y_train)
print("XGB 수행 시간: {0:.1f} 초 ".format(time.time() - start_time))

# Accuracy of the baseline model on the held-out test split.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Carve a 10% validation set out of the training split.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=0,
)
# Shapes of the outer split first, then of the inner train/validation split.
print(x_train.shape, x_test.shape)
print(x_tr.shape, x_val.shape)

In [None]:
# Wrap each split in a DMatrix for XGBoost's native training API.
dtr, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((x_tr, y_tr), (x_val, y_val), (x_test, y_test))
)

In [None]:
# Booster parameters for the native xgb.train API.
params = {
    'max_depth': 3,
    'eta': 0.1,  # learning_rate
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # The obsolete 'silent' flag was removed: 'verbosity' supersedes it.
    'verbosity': 0,
}
# Upper bound on the number of boosting rounds.
num_rounds = 400

In [None]:
# (DMatrix, name) pairs evaluated while training.
eval_list = [
    (dtr, 'train'),
    (dval, 'eval'),
]

xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=num_rounds,
    evals=eval_list,
    early_stopping_rounds=50,
)

In [None]:
# Predicted probabilities for the test split; show the first ten.
pred_probs = xgb_model.predict(dtest)
print('예측 확률 10개 표시',np.round(pred_probs[:10],3))
# Threshold at 0.5 to get hard 0/1 labels, kept as a plain Python list.
preds = (pred_probs > 0.5).astype(int).tolist()
print('예측값 10개 표시',preds[:10])

In [None]:
# Mean predicted probability over the test split.
pred_probs.mean()

In [None]:
# Display the predicted probabilities.
pred_probs

In [None]:
# preds holds 0/1 labels, so the mean is the share of rows predicted as 1.
np.mean(preds)

In [None]:
# Show a short slice; dumping the full prediction list floods the output.
preds[:10]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# Revised get_clf_eval(): also reports ROC-AUC.
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and standard classification metrics.

    Args:
        y_test: True labels.
        pred: Predicted hard labels, compared against ``y_test`` for the
            confusion matrix, accuracy, precision, recall and F1.
        pred_proba: Predicted scores/probabilities used for ROC-AUC. When it
            is ``None`` the AUC figure is left out instead of failing.

    Returns:
        None. Results are printed.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores; skip it when the caller passed only hard labels.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)
    # The spacing before "F1" is kept exactly as the notebook has always printed it.
    report = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)

In [None]:
# Report the metrics of the current model on the test split.
get_clf_eval(y_test,preds,pred_probs)

In [None]:
def scaling(df_scaling, scaled_form = 'MinMaxScaler()'):
    """Scale the non-categorical columns and re-attach the categorical ones.

    The columns ``gender``, ``tartar``, ``hearing(right)``, ``hearing(left)``
    and ``dental caries`` are passed through untouched. Every other column is
    scaled, including a label column if one is present, so callers should
    drop the label before using the result as a feature matrix.

    Args:
        df_scaling: DataFrame containing the five categorical columns above
            plus the columns to scale.
        scaled_form: ``'StandardScaler()'`` or ``'RobustScaler()'``; any other
            value selects ``MinMaxScaler``.

    Returns:
        A new DataFrame with the scaled columns first and the categorical
        columns after them, carrying the same index as ``df_scaling``.
    """
    cate_columns = ['gender', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']
    cate_features = df_scaling[cate_columns]
    scaled_features = df_scaling.drop(cate_columns, axis=1)

    # Map the accepted option strings to scaler classes; unknown values fall
    # back to MinMaxScaler, as before.
    scaler_classes = {
        'StandardScaler()': StandardScaler,
        'RobustScaler()': RobustScaler,
    }
    scaler = scaler_classes.get(scaled_form, MinMaxScaler)()

    # Keep the original index: the column assignment below aligns on index,
    # so a fresh RangeIndex would misalign the categorical columns (NaN rows)
    # whenever the input is not indexed 0..n-1.
    result = pd.DataFrame(
        scaler.fit_transform(scaled_features),
        columns=scaled_features.columns,
        index=scaled_features.index,
    )
    result[cate_columns] = cate_features

    return result

In [None]:
# Standard-scaled copy of the preprocessed frame.
std_scaled = scaling(df, 'StandardScaler()')
# Preview the first rows only instead of rendering the whole table.
std_scaled.head()

In [None]:
# Robust-scaled copy of the preprocessed frame.
rob_scaled = scaling(df, 'RobustScaler()')
# Preview the first rows only instead of rendering the whole table.
rob_scaled.head()

In [None]:
# Min-max-scaled copy of the preprocessed frame (the default scaler).
min_max_scaled = scaling(df)
# Preview the first rows only instead of rendering the whole table.
min_max_scaled.head()

In [None]:
# --- Experiment on the standard-scaled features ---
# The scaled frame still carries the 'smoking' column (in scaled form); drop it
# so the label is not used as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    std_scaled.drop(columns=['smoking'], errors='ignore'),
    target,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Carve a 10% validation set out of the training split.
x_tr, x_val, y_tr, y_val = train_test_split(x_train,y_train,test_size=0.1,random_state=0)
print(x_train.shape, x_test.shape)
print(x_tr.shape, x_val.shape)

dtr = xgb.DMatrix(data=x_tr,label=y_tr)
dval = xgb.DMatrix(data=x_val,label=y_val)
dtest = xgb.DMatrix(data=x_test,label=y_test)

# Booster parameters (the obsolete 'silent' flag is dropped; 'verbosity' supersedes it).
params = { 'max_depth':3,
           'eta': 0.1,  # learning_rate
           'objective':'binary:logistic',
           'eval_metric':'logloss',
           'verbosity':0
        }
num_rounds = 400

eval_list = [(dtr,'train'),(dval,'eval') ] # datasets evaluated during training

xgb_model = xgb.train(params=params,dtrain=dtr,num_boost_round=num_rounds,
                      early_stopping_rounds=50,evals=eval_list)

# Predicted probabilities, then hard labels at a 0.5 threshold.
pred_probs = xgb_model.predict(dtest)
print('예측 확률 10개 표시',np.round(pred_probs[:10],3))
preds = [1 if x > 0.5 else 0 for x in pred_probs]
print('예측값 10개 표시',preds[:10])

In [None]:
# Report the metrics of the model trained on the standard-scaled data.
get_clf_eval(y_test,preds,pred_probs)

In [None]:
# --- Experiment on the robust-scaled features ---
# The scaled frame still carries the 'smoking' column (in scaled form); drop it
# so the label is not used as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    rob_scaled.drop(columns=['smoking'], errors='ignore'),
    target,
    test_size=0.2,
    random_state=0,
)

# Carve a 10% validation set out of the training split.
x_tr, x_val, y_tr, y_val = train_test_split(x_train,y_train,test_size=0.1,random_state=0)
print(x_train.shape, x_test.shape)
print(x_tr.shape, x_val.shape)

dtr = xgb.DMatrix(data=x_tr,label=y_tr)
dval = xgb.DMatrix(data=x_val,label=y_val)
dtest = xgb.DMatrix(data=x_test,label=y_test)

# Booster parameters (the obsolete 'silent' flag is dropped; 'verbosity' supersedes it).
params = { 'max_depth':3,
           'eta': 0.1,  # learning_rate
           'objective':'binary:logistic',
           'eval_metric':'logloss',
           'verbosity':0
        }
num_rounds = 400

eval_list = [(dtr,'train'),(dval,'eval') ] # datasets evaluated during training

xgb_model = xgb.train(params=params,dtrain=dtr,num_boost_round=num_rounds,
                      early_stopping_rounds=50,evals=eval_list)

# Predicted probabilities, then hard labels at a 0.5 threshold.
pred_probs = xgb_model.predict(dtest)
print('예측 확률 10개 표시',np.round(pred_probs[:10],3))
preds = [1 if x > 0.5 else 0 for x in pred_probs]
print('예측값 10개 표시',preds[:10])

In [None]:
# Report the metrics of the model trained on the robust-scaled data.
get_clf_eval(y_test,preds,pred_probs)

In [None]:
# --- Experiment on the min-max-scaled features ---
# The scaled frame still carries the 'smoking' column; drop it so the label
# is not used as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    min_max_scaled.drop(columns=['smoking'], errors='ignore'),
    target,
    test_size=0.2,
    random_state=0,
)

# Carve a 10% validation set out of the training split.
x_tr, x_val, y_tr, y_val = train_test_split(x_train,y_train,test_size=0.1,random_state=0)
print(x_train.shape, x_test.shape)
print(x_tr.shape, x_val.shape)

dtr = xgb.DMatrix(data=x_tr,label=y_tr)
dval = xgb.DMatrix(data=x_val,label=y_val)
dtest = xgb.DMatrix(data=x_test,label=y_test)

# Booster parameters (the obsolete 'silent' flag is dropped; 'verbosity' supersedes it).
params = { 'max_depth':3,
           'eta': 0.1,  # learning_rate
           'objective':'binary:logistic',
           'eval_metric':'logloss',
           'verbosity':0
        }
num_rounds = 400

eval_list = [(dtr,'train'),(dval,'eval') ] # datasets evaluated during training

xgb_model = xgb.train(params=params,dtrain=dtr,num_boost_round=num_rounds,
                      early_stopping_rounds=50,evals=eval_list)

# Predicted probabilities, then hard labels at a 0.5 threshold.
pred_probs = xgb_model.predict(dtest)
print('예측 확률 10개 표시',np.round(pred_probs[:10],3))
preds = [1 if x > 0.5 else 0 for x in pred_probs]
print('예측값 10개 표시',preds[:10])

In [None]:
# Report the metrics of the model trained on the min-max-scaled data.
get_clf_eval(y_test,preds,pred_probs)