In [3]:
import os
import pandas as pd
import numpy as np

import joblib
import sidetable

from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from itertools import combinations
from MetsDataByGender import get_mets_data
from MetsEvaluation import get_feature_importance, get_metric, get_calib_metric

from numpy import array
from matplotlib import pyplot as plt
%matplotlib inline

### Dataset

In [261]:
# get_mets_data arguments used in this cell:
#   fts_ctg : feature category - 'anthropometric', 'survey-based' or 'synthesized'
#   fts     : list of additional features to include
#   gender  : 0 = male, 1 = female
add_fts = [
    'sbp', 'wc', 'dbp', 'BFP', 'bmi', 'CUNBAE', 'BAI',
    'ss11', 'ss10', 'ss24', 'ss13', 'ss07',
    'bWC', 'WC', 'BP', 'wrm',
    'G1_INT', 'G2_INT', 'G3_INT2',
]

# No category preset (empty string), explicit feature list, male subset, seed 100.
tr, val, ts, info, beta, tau, ctg_idxs, ctg_dims = get_mets_data(
    fts_ctg='',
    fts=add_fts,
    gender=0,
    ONEHOT=True,
    TABNET=False,
    mrmr=False,
    SEED=100,
)

### Trial 1. Base Model
- Default hyperparameters (only `random_state` is fixed)

In [265]:
# Column 0 of each split is taken as the target, the remaining columns as features.
X_train, y_train = tr.iloc[:, 1:], tr.iloc[:, 0]
X_valid, y_valid = val.iloc[:, 1:], val.iloc[:, 0]

# Constructor arguments shared by every random forest in this notebook
# (fixed seed so repeated fits give the same model).
args = {'random_state' : 100}

# Template estimator that later cells hand to RFE / RFECV / GridSearchCV.
estimator = RandomForestClassifier(**args)

# Fit a separate instance rather than aliasing `estimator`: with
# `base_model = estimator` the base model would be silently refit every time
# `estimator` is fitted again further down the notebook.
base_model = RandomForestClassifier(**args)
base_model.fit(X_train, y_train)
prob = base_model.predict_proba(X_valid)

# Importance table of the base model; `fts.feature` seeds the feature selection
# below. The trailing 30 is presumably the number of top features to keep -
# confirm against get_feature_importance.
fts = get_feature_importance(base_model.feature_importances_, X_train.columns, 30)

### Trial 2. Feature Selection Model

In [269]:
# Features ranked by the base model; every selector below starts from this frame.
X_ranked = X_train[fts.feature]

# RFE Top 30
rfe_top30 = RFE(estimator, step=1, n_features_to_select=30, verbose=0).fit(X_ranked, y_train)
top30_fts = array(X_ranked.columns)[rfe_top30.support_]

# RFECV on the RFE top-30 subset (scoring alternatives: 'f1', 'roc_auc')
rfecv = RFECV(estimator, step=1, cv=5, scoring='roc_auc', min_features_to_select=5, verbose=0).fit(X_train[top30_fts], y_train)
rfecv_fts = X_train[top30_fts].columns[rfecv.support_]

# RFE Top 10
# NOTE(review): this final selection is fit on the full ranked set, not on
# `rfecv_fts`, so the two stages above do not influence `slc_fts`. They are
# kept under their own names instead of being overwritten; confirm whether a
# chained top-30 -> RFECV -> top-10 selection was intended.
selector = RFE(estimator, step=1, n_features_to_select=10, verbose=0)
result = selector.fit(X_ranked, y_train)
slc_fts = array(X_ranked.columns)[result.support_]

# Fresh instance instead of `top_n_model = estimator`: the alias would be
# refit (and this model lost) as soon as `estimator` is fitted again.
top_n_model = RandomForestClassifier(**args)
top_n_model.fit(X_train[slc_fts],y_train)
prob = top_n_model.predict_proba(X_valid[slc_fts])
get_metric(prob, y_valid, 0.5)

# Stored under its own name so that `fts` (read at the top of this cell) is
# not overwritten - otherwise re-running the cell would select from only the
# 10 features kept here.
top_n_importance = get_feature_importance(top_n_model.feature_importances_, X_train[slc_fts].columns, 30)
plt.figure(figsize=(12,8))
plt.barh(top_n_importance['feature'], top_n_importance['importance'])
plt.gca().invert_yaxis()
plt.show()

### Trial 3. Find Best Combination

In [276]:
# Exhaustive search over every subset (size >= 3) of the selected features,
# keeping the subset with the highest validation AUC seen so far.
# NOTE(review): subsets are ranked on the validation split, so the winner's
# validation AUC is optimistically biased - confirm the final model on `ts`.
best_res = 0
best_ft = []

# Dedicated instance for the search: refitting the shared `estimator` object
# here would also overwrite any other name bound to it (e.g. `top_n_model`),
# leaving it fitted on whichever combination happened to be tried last.
model = RandomForestClassifier(**args)

for j in range(3, len(slc_fts) + 1):
    print('# Comb :', j)
    # Iterate lazily; the combinations do not need to be materialised as a list.
    for i, ft in enumerate(combinations(slc_fts, j)):

        if i % 1000 == 0 :
            print('====',i+1,'====')

        # Select columns with a list: a tuple key can be ambiguous for pandas.
        cols = list(ft)
        model.fit(X_train.loc[:, cols], y_train)
        prob = model.predict_proba(X_valid.loc[:, cols])
        metric = get_metric(prob, y_valid, 0.5)
        crn = metric['auc']
        # Strict '>' keeps the earliest subset on ties.
        if crn > best_res :
            best_res = crn
            best_ft = ft
    # Reports the running best over all sizes tried so far, not the best of size j.
    print('Best AUC :',best_res, ' Features : ', best_ft)

# Comb : 3
==== 1 ====
Best AUC : 0.8574882250255261  Features :  ('BP', 'BFP', 'bWC')
# Comb : 4
==== 1 ====
Best AUC : 0.8628314284773229  Features :  ('BP', 'bWC', 'wrm', 'ss13')
# Comb : 5
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')
# Comb : 6
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')
# Comb : 7
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')
# Comb : 8
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')
# Comb : 9
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')
# Comb : 10
==== 1 ====
Best AUC : 0.8708458219426237  Features :  ('wc', 'BP', 'BFP', 'bmi', 'ss13')


In [278]:
# Refit on the best feature combination found by the search above.
# A fresh instance is used instead of `model = estimator`, because fitting the
# shared `estimator` object would also refit every other name bound to it.
best_cols = list(best_ft)  # explicit list: unambiguous column selection
model = RandomForestClassifier(**args)
model.fit(X_train.loc[:, best_cols], y_train)
prob = model.predict_proba(X_valid.loc[:, best_cols])
get_metric(prob, y_valid, 0.5)

# Kept under its own name so the `fts` table that feature selection reads is
# not overwritten by this 'best combination' view.
best_importance = get_feature_importance(model.feature_importances_, X_train.loc[:, best_cols].columns, 30)
plt.figure(figsize=(12,8))
plt.barh(best_importance['feature'], best_importance['importance'])
plt.gca().invert_yaxis()
plt.show()

{'acc': 0.7542242703533026,
 'bac': 0.7943101347122954,
 'recall': 0.8608247422680413,
 'ppv': 0.4394736842105263,
 'npv': 0.9547359597652976,
 'sepecificity': 0.7277955271565495,
 'f1': 0.5818815331010453,
 'auc': 0.8708458219426237}

### Model Tuning

In [281]:
# Hyperparameter search space: 2 * 6 * 2 * 3 * 4 = 288 candidates.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [4, 5, 6, 7, 8, None],
    'min_samples_leaf': [2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 500, 1000],
}

# Shuffled 5-fold split with a fixed seed so the search is repeatable.
kf = KFold(random_state=100, n_splits=5, shuffle=True)

# Grid search scored by ROC AUC. GridSearchCV clones `estimator` for every fit,
# so the object passed in is not modified.
grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           cv=kf,
                           scoring='roc_auc',
                           n_jobs=-1,
                           refit=True,  # the default, spelled out because best_estimator_ is used below
                           verbose=1
                          )

grid_search.fit(X_train.loc[:,best_ft], y_train)

# With refit=True the search has already retrained a clone of `estimator`
# (same `args`, same seed) with the best parameters on the full training data.
# Reusing it avoids a second, identical and potentially expensive fit; the
# chosen parameters remain available as grid_search.best_params_.
best_model = grid_search.best_estimator_
prob = best_model.predict_proba(X_valid.loc[:,best_ft])
get_metric(prob, y_valid, 0.5)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


{'acc': 0.7516641065028162,
 'bac': 0.7975585455024539,
 'recall': 0.8737113402061856,
 'ppv': 0.4374193548387097,
 'npv': 0.9584040747028862,
 'sepecificity': 0.721405750798722,
 'f1': 0.5829750644883921,
 'auc': 0.8772635947432561}