In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier # for modeling
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score , recall_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization

# plot_confusion_matrix was removed in scikit-learn 1.2. Keep the import for
# older installs without letting it break the whole notebook on newer ones.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
def preprocessing(df_scaling):
    """Clean a raw frame and add derived features, without touching the input.

    For a feature frame the steps are:
      1. Drop the ``ID`` column when present.
      2. Bucket ``eyesight(left)`` / ``eyesight(right)`` into grades 1-4.
      3. Drop the ``oral`` column.
      4. Label-encode ``gender`` and ``tartar``.
      5. Recode the value 2 to 0 in ``hearing(left)`` / ``hearing(right)``.
      6. Append ``bmi`` and ``wwi`` columns.

    A frame that has exactly one column once ``ID`` is gone is treated as a
    label frame and returned as-is, so calling this function again on an
    already-stripped label frame is harmless.

    Parameters
    ----------
    df_scaling : pandas.DataFrame
        Raw feature frame or label frame.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_scaling`` itself is never modified.
    """
    # drop() already returns a new frame; copy explicitly in the other case so
    # the column assignments below can never leak into the caller's frame.
    if 'ID' in df_scaling:
        df_scaling = df_scaling.drop("ID", axis = 1)
    else:
        df_scaling = df_scaling.copy()

    # Single remaining column -> label frame, nothing else to do.
    if len(df_scaling.columns) == 1:
        return df_scaling

    def _eyesight_grade(x):
        """Map a raw eyesight value to a grade.

        below 1.0    -> 1 (poor)
        1.0 to <1.6  -> 2 (normal)
        1.6 to 2.0   -> 3 (good)
        anything else (e.g. the 9.9 "blind" code, or NaN) -> 4
        """
        if x < 1.0:
            return 1
        if x < 1.6:
            return 2
        if x <= 2.0:
            return 3
        return 4

    for col in ('eyesight(left)', 'eyesight(right)'):
        df_scaling[col] = df_scaling[col].apply(_eyesight_grade)

    df_scaling = df_scaling.drop('oral', axis = 1)

    # Label-encode the categorical text columns.
    # NOTE(review): the encoder is fitted on whichever frame is passed in, so
    # train and test only receive matching codes when both contain the same
    # set of category values - confirm this holds for the data used.
    for col in ('gender', 'tartar'):
        df_scaling[col] = LabelEncoder().fit_transform(df_scaling[col])

    # hearing: recode 2 -> 0, leave every other value untouched.
    for col in ('hearing(left)', 'hearing(right)'):
        df_scaling[col] = df_scaling[col].mask(df_scaling[col] == 2, 0)

    # bmi = weight(kg) / height(m)^2
    df_scaling['bmi'] = df_scaling['weight(kg)'] / ((df_scaling['height(cm)'] * 0.01) ** 2)
    # wwi = waist(cm) / sqrt(weight(kg))
    df_scaling['wwi'] = df_scaling['waist(cm)'] / np.sqrt(df_scaling['weight(kg)'])
    return df_scaling

In [None]:
def scaling(train_data, test_data, scaled_form = 'MinMaxScaler()'):
    """Preprocess both splits and scale their non-categorical columns.

    Both frames are first passed through ``preprocessing``. The columns listed
    in ``cate_cols`` below are kept as they are; every other column is scaled.
    Fitted scalers are fitted on the training split only and then applied to
    both splits.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Raw feature frames accepted by ``preprocessing``.
    scaled_form : str
        ``'StandardScaler()'``, ``'RobustScaler()'`` or ``'logScaler'``
        (element-wise ``np.log1p``). Any other value, including the default
        ``'MinMaxScaler()'``, selects ``MinMaxScaler``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled train and test frames: scaled columns first, followed by the
        untouched categorical columns. Each frame keeps the row index of its
        preprocessed input.
    """
    cate_cols = ['gender', 'tartar', 'hearing(right)', 'hearing(left)',
                 'dental caries', 'eyesight(left)', 'eyesight(right)']

    train_data = preprocessing(train_data)
    test_data = preprocessing(test_data)

    tr_numeric = train_data.drop(cate_cols, axis=1)
    ts_numeric = test_data.drop(cate_cols, axis=1)

    if scaled_form == 'logScaler':
        # log1p on a DataFrame returns a DataFrame with the same index/columns.
        tr_scaled = np.log1p(tr_numeric)
        ts_scaled = np.log1p(ts_numeric)
    else:
        if scaled_form == 'StandardScaler()':
            scaler = StandardScaler()
        elif scaled_form == 'RobustScaler()':
            scaler = RobustScaler()
        else:
            scaler = MinMaxScaler()
        scaler.fit(tr_numeric)  # fit on the training split only

        # transform() returns a bare ndarray. Rebuild the frames on the
        # ORIGINAL index: the categorical columns are re-attached by index
        # alignment below, so a fresh RangeIndex would yield NaNs or
        # mismatched rows for any input whose index is not 0..n-1.
        tr_scaled = pd.DataFrame(scaler.transform(tr_numeric),
                                 columns=tr_numeric.columns, index=tr_numeric.index)
        ts_scaled = pd.DataFrame(scaler.transform(ts_numeric),
                                 columns=ts_numeric.columns, index=ts_numeric.index)

    tr_scaled[cate_cols] = train_data[cate_cols]
    ts_scaled[cate_cols] = test_data[cate_cols]
    return tr_scaled, ts_scaled

In [None]:
# Read the four CSV files (x/y, train/test) into DataFrames
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Smoking_raw/competition_format/x_train.csv',
        './Smoking_raw/competition_format/x_test.csv',
        './Smoking_raw/competition_format/y_train.csv',
        './Smoking_raw/competition_format/y_test.csv',
    )
)

- logScaler 변환 후 모델링

In [None]:
# Preprocess both feature splits and apply log1p to the non-categorical columns.
scaled = scaling(x_train, x_test, scaled_form = 'logScaler')
x_train_log, x_test_log = scaled

# Strip the ID column from the label frames. The guard keeps this cell
# re-runnable: once ID is gone the frames are left as they are instead of
# being pushed through preprocessing() a second time.
if 'ID' in y_train:
    y_train = preprocessing(y_train)
if 'ID' in y_test:
    y_test = preprocessing(y_test)

In [None]:
# Preview the first rows of the scaled training features
x_train_log.head()

In [None]:
# Histogram of every column of the scaled training features.
# The trailing ';' keeps the returned array of Axes out of the cell output.
x_train_log.hist(figsize = (20, 20), bins = 12, legend=False);

### 랜덤포레스트(Random Forest) 적용

In [None]:
# Baseline random forest.
# NOTE(review): the variable is called rf_clf_100 but the forest has 50 trees.
rf_clf_100 = RandomForestClassifier(random_state=0, n_estimators=50)
# y_train is a single-column frame; flatten it to 1-D so fit() does not emit
# a DataConversionWarning.
rf_clf_100.fit(x_train_log, np.ravel(y_train))
pred = rf_clf_100.predict(x_test_log)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

In [None]:
# Accuracy / precision / recall of the current test-set predictions
for metric_label, metric_fn in (
    ("정확도 : ", accuracy_score),
    ("정밀도 : ", precision_score),
    ("재현율 : ", recall_score),
):
    print(metric_label, metric_fn(y_test, pred))

### GridSearchCV로 교차검증 및 하이퍼 파라미터 튜닝

In [None]:
# Hyper-parameter grid searched with 2-fold cross-validation
params = {
    'n_estimators':[50],
    'max_depth' : [6,8,10,12],
    'min_samples_leaf' : [8,12,18],
    'min_samples_split' : [8,16,20]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)

grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
# Flatten the single-column label frame to 1-D; otherwise every CV fit emits
# a DataConversionWarning.
grid_cv.fit(x_train_log, np.ravel(y_train))

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Refit with the tuned hyper-parameters, then predict and evaluate on the test set.
# NOTE(review): the values are hard-coded, presumably copied from
# grid_cv.best_params_ - confirm they still match the search result.
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=12, min_samples_leaf=8,
                                min_samples_split=8, random_state=0)
# Flatten the single-column label frame to 1-D to avoid a DataConversionWarning.
rf_clf.fit(x_train_log, np.ravel(y_train))
pred = rf_clf.predict(x_test_log)
print('예측 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

In [None]:
# Accuracy of the tuned forest on the training split
accuracy_score(y_train, rf_clf.predict(x_train_log))

### 개별 feature들의 중요도 시각화

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_import = rf_clf.feature_importances_
ftr_import_se = pd.Series(ftr_import, index=x_train_log.columns)
ftr_import = ftr_import_se.sort_values(ascending=False)#[:20] # 중요도가 높은 20개의 피처만 추출
ftr_import = ftr_import[1:]

plt.figure(figsize=(8,6))
plt.title('Feature importances')
# x축은 중요도 값, y축은 ftr_top20 시리즈의 index
sns.barplot(x=ftr_import,y=ftr_import.index) # 가로막대 그래프
plt.show()

### ROC AUC Curve

In [None]:
# Both the AUC and the curve must be computed from the predicted probability
# of the second class; hard 0/1 predictions collapse the ranking and give a
# different AUC from the one the plotted curve represents.
proba_pos = rf_clf.predict_proba(x_test_log)[:,1]
rf_roc_auc = roc_auc_score(y_test, proba_pos)
fpr, tpr, thresholds = roc_curve(y_test, proba_pos)

plt.figure()
plt.plot(fpr, tpr, label = 'AUC (area = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1],'g--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc='lower right')  # without this the AUC label is never drawn
plt.show()

### Confusion Matrix

In [None]:
# Row-normalised confusion matrix of the tuned forest on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# use ConfusionMatrixDisplay.from_estimator when it exists and fall back to
# the legacy helper on older installs.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(rf_clf, x_test_log, y_test, cmap = plt.cm.Greens, normalize = "true");
else:
    plot_confusion_matrix(rf_clf, x_test_log, y_test, cmap = plt.cm.Greens, normalize = "true");