In [1]:
import pandas as pd 
import numpy as np

#데이터 불러오기

df_delete = pd.read_csv('dataset/delete_null.csv')
df_mean = pd.read_csv('dataset/mean_null.csv')
df_mode = pd.read_csv('dataset/mode_null.csv')

In [2]:
#스케일링 도중 생성된 결측행 제거
df_delete = df_delete.dropna()
df_mode = df_mode.dropna()
df_mean = df_mean.dropna()


### Trainset / testset 분할

In [3]:
X = df_delete.drop(['Total_slp_wd_standard','SitTime_standard','Total_slp_wd_scaled','SitTime_scaled','BMI'],axis='columns') 
y = df_delete['BMI']

In [4]:
X

Unnamed: 0,sex,age,D_1_1,Total_slp_wd,BP1,BO1,BE5_1,BP5,SitTime
0,1.0,13.0,3.0,480.0,4.0,5.0,1.0,2.0,540.0
1,1.0,16.0,2.0,420.0,3.0,3.0,1.0,2.0,540.0
2,2.0,15.0,2.0,540.0,3.0,4.0,1.0,2.0,600.0
3,2.0,17.0,3.0,540.0,3.0,4.0,1.0,1.0,1020.0
4,1.0,12.0,2.0,480.0,3.0,2.0,1.0,2.0,600.0
...,...,...,...,...,...,...,...,...,...
1520,2.0,16.0,2.0,600.0,4.0,3.0,1.0,2.0,660.0
1521,2.0,16.0,2.0,720.0,3.0,3.0,1.0,2.0,720.0
1522,1.0,14.0,3.0,570.0,3.0,3.0,1.0,2.0,420.0
1523,2.0,12.0,2.0,660.0,3.0,4.0,1.0,2.0,630.0


In [5]:
y.value_counts()

0.0    1268
1.0     242
Name: BMI, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# test_size : 테스트 데이터셋의 비율(float)이나 갯수(int) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=3)

In [7]:
y_test.value_counts()

0.0    317
1.0     61
Name: BMI, dtype: int64

# 모델링 (random forest)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold

#rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
#rf.fit(X_train, y_train)

#최적 파라미터 값 찾기
params = { 'n_estimators' : [10, 100,200],
           'max_depth' : [6, 8, 10],
           'min_samples_leaf' : [3,5 ,7],
           'min_samples_split' : [2,3,6]}

rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
kfold = KFold(n_splits=5, shuffle=True, random_state=0) #업샘플링할때
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) #업샘플링 하지 않았을때
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = kfold, n_jobs = -1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 8, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 100}
최고 예측 정확도: 0.8914


In [18]:
scores_df = pd.DataFrame(grid_cv.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.045277,0.009866,0.017354,0.002646,6,3,2,10,"{'max_depth': 6, 'min_samples_leaf': 3, 'min_s...",0.876652,0.841410,0.871681,0.920354,0.902655,0.882550,0.027125,49
1,0.279451,0.014430,0.038098,0.009193,6,3,2,100,"{'max_depth': 6, 'min_samples_leaf': 3, 'min_s...",0.894273,0.837004,0.884956,0.911504,0.889381,0.883424,0.024895,40
2,0.675593,0.041806,0.068816,0.012568,6,3,2,200,"{'max_depth': 6, 'min_samples_leaf': 3, 'min_s...",0.894273,0.841410,0.889381,0.911504,0.893805,0.886075,0.023578,19
3,0.043882,0.007108,0.015957,0.005389,6,3,3,10,"{'max_depth': 6, 'min_samples_leaf': 3, 'min_s...",0.876652,0.841410,0.871681,0.920354,0.902655,0.882550,0.027125,49
4,0.385569,0.055495,0.044481,0.010241,6,3,3,100,"{'max_depth': 6, 'min_samples_leaf': 3, 'min_s...",0.894273,0.837004,0.884956,0.911504,0.889381,0.883424,0.024895,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.506845,0.084893,0.043684,0.012732,10,7,3,100,"{'max_depth': 10, 'min_samples_leaf': 7, 'min_...",0.894273,0.841410,0.893805,0.915929,0.889381,0.886960,0.024577,10
77,0.773533,0.021836,0.077192,0.014476,10,7,3,200,"{'max_depth': 10, 'min_samples_leaf': 7, 'min_...",0.894273,0.841410,0.889381,0.915929,0.889381,0.886075,0.024394,19
78,0.052459,0.002863,0.036103,0.008112,10,7,6,10,"{'max_depth': 10, 'min_samples_leaf': 7, 'min_...",0.894273,0.850220,0.880531,0.898230,0.880531,0.880757,0.016856,61
79,0.481512,0.080191,0.040492,0.007792,10,7,6,100,"{'max_depth': 10, 'min_samples_leaf': 7, 'min_...",0.894273,0.841410,0.893805,0.915929,0.889381,0.886960,0.024577,10


In [19]:
df_results = scores_df[['params', 'mean_test_score', 'rank_test_score', \
           'split0_test_score', 'split1_test_score', 'split2_test_score','split3_test_score','split4_test_score']]

In [20]:
df_results.sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
52,"{'max_depth': 8, 'min_samples_leaf': 7, 'min_s...",0.891377,1,0.903084,0.841410,0.902655,0.920354,0.889381
46,"{'max_depth': 8, 'min_samples_leaf': 7, 'min_s...",0.891377,1,0.903084,0.841410,0.902655,0.920354,0.889381
49,"{'max_depth': 8, 'min_samples_leaf': 7, 'min_s...",0.891377,1,0.903084,0.841410,0.902655,0.920354,0.889381
25,"{'max_depth': 6, 'min_samples_leaf': 7, 'min_s...",0.888729,4,0.898678,0.837004,0.893805,0.911504,0.902655
19,"{'max_depth': 6, 'min_samples_leaf': 7, 'min_s...",0.888729,4,0.898678,0.837004,0.893805,0.911504,0.902655
...,...,...,...,...,...,...,...,...
30,"{'max_depth': 8, 'min_samples_leaf': 3, 'min_s...",0.870169,76,0.881057,0.828194,0.871681,0.889381,0.880531
33,"{'max_depth': 8, 'min_samples_leaf': 3, 'min_s...",0.870169,76,0.881057,0.828194,0.871681,0.889381,0.880531
12,"{'max_depth': 6, 'min_samples_leaf': 5, 'min_s...",0.866641,79,0.872247,0.823789,0.876106,0.880531,0.880531
15,"{'max_depth': 6, 'min_samples_leaf': 5, 'min_s...",0.866641,79,0.872247,0.823789,0.876106,0.880531,0.880531


# 모델 퍼모먼스 함수

### 업샘플링 데이터 시,  단순 교차검증을 사용

In [10]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(grid_cv, X_test, y_test, cv=5)
scores_re = cross_val_score(grid_cv, X_test, y_test, cv=5, scoring='recall')
scores_pre = cross_val_score(grid_cv, X_test, y_test, cv=5, scoring='precision')
print("Accuracy mean : ", scores.mean())
print("Recall mean : ",scores_re.mean())
print("Precision mean : ",scores_pre.mean())

Accuracy mean :  0.8517192982456139
Recall mean :  0.2935897435897436
Precision mean :  0.6016666666666667


### 편향된 데이터이므로 계층별 k-겹 교차검증을 사용

평가할 때도 label의 분포를 유지시키며 cv별 올바른 평가를 위함

In [69]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_val_score(grid_cv, X_test, y_test, cv=skf)
scores_re = cross_val_score(grid_cv, X_test, y_test, cv=skf, scoring='recall')
scores_pre = cross_val_score(grid_cv, X_test, y_test, cv=skf, scoring='precision')
print("Accuracy mean : ", scores.mean())
print("Recall mean : ",scores_re.mean())
print("Precision mean : ",scores_pre.mean())


Accuracy mean :  0.8756491228070177
Recall mean :  0.3128205128205128
Precision mean :  0.7928571428571429


In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
def model_evaluation(label, predict):
    cf_matrix = confusion_matrix(label, predict)
    Accuracy = (cf_matrix[0][0] + cf_matrix[1][1]) / sum(sum(cf_matrix))
    Precision = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[0][1])
    Recall = cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[1][0])
    Specificity = cf_matrix[0][0] / (cf_matrix[0][0] + cf_matrix[0][1])
    F1_Score = (2 * Recall * Precision) / (Recall + Precision)
    F2_Score = (5 * Recall * Precision) / (Recall + 4*Precision)
    
    print("Accuracy: ", Accuracy)
    print("Precision: ", Precision)
    print("Recall: ", Recall)
    print("Specificity: ", Specificity)
    print("F1-Score: ", F1_Score)
    print("F2-Score: ", F2_Score)
    print("auc score: " , roc_auc_score(label, np.round(predict,0)))


In [41]:
predicted = grid_cv.predict(X_test)
print(confusion_matrix(y_test, predicted))
print(model_evaluation(y_test, predicted))

[[311   6]
 [ 36  25]]
Accuracy:  0.8888888888888888
Precision:  0.8064516129032258
Recall:  0.4098360655737705
Specificity:  0.9810725552050473
F1-Score:  0.5434782608695652
F2-Score:  0.4545454545454546
auc score:  0.695454310389409
None


# 변수 중요도

참조 https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html

In [29]:
import eli5
from eli5.sklearn import PermutationImportance 

perm = PermutationImportance(grid_cv, scoring = "f1", random_state = 0).fit(X_train, y_train) 
df_impt = eli5.show_weights(perm, top = 80, feature_names = X_train.columns.tolist())

# 변수중요도 추출
df_impt

Weight,Feature
0.4993  ± 0.0616,BO1
0.0917  ± 0.0300,age
0.0755  ± 0.0303,Total_slp_wd
0.0719  ± 0.0440,sex
0.0222  ± 0.0253,D_1_1
0.0208  ± 0.0150,BP1
0.0176  ± 0.0158,SitTime
0.0047  ± 0.0179,BE5_1
0.0017  ± 0.0040,BP5


In [25]:

perm = PermutationImportance(grid_cv, scoring = "f1", random_state = 42).fit(X_train, y_train) 
eli5.show_weights(perm, top = 80, feature_names = X_train.columns.tolist())


Weight,Feature
0.4988  ± 0.0500,BO1
0.1109  ± 0.0368,sex
0.0895  ± 0.0182,age
0.0668  ± 0.0226,Total_slp_wd
0.0204  ± 0.0199,BP1
0.0190  ± 0.0120,SitTime
0.0181  ± 0.0127,D_1_1
0.0041  ± 0.0052,BP5
0.0029  ± 0.0078,BE5_1


In [26]:

perm = PermutationImportance(grid_cv, scoring = "recall", random_state = 42).fit(X_train, y_train) 
eli5.show_weights(perm, top = 80, feature_names = X_train.columns.tolist())


Weight,Feature
0.4398  ± 0.0405,BO1
0.0950  ± 0.0214,age
0.0873  ± 0.0379,sex
0.0773  ± 0.0252,Total_slp_wd
0.0199  ± 0.0193,D_1_1
0.0188  ± 0.0228,BP1
0.0188  ± 0.0113,SitTime
0.0055  ± 0.0070,BP5
0.0044  ± 0.0083,BE5_1


In [23]:
# 변수중요도 시각화
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

result = permutation_importance(grid_cv, X_train, y_train, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()

plt.barh(X_train.columns[sorted_idx], sorted(result.importances_mean))
plt.title('Permutation Importance', fontsize=18)
plt.ylabel('Feature name', fontsize=15)
plt.show()

SyntaxError: positional argument follows keyword argument (<ipython-input-23-f1f35dff6b5a>, line 5)