In [132]:
import os
import pandas as pd
import numpy as np

import joblib
import sidetable

from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from itertools import combinations
from MetsDataByGenderRid import get_mets_data
from MetsEvaluation import get_feature_importance, get_metric, get_calib_metric

from numpy import array
from matplotlib import pyplot as plt
%matplotlib inline

### Dataset

In [252]:
# Arguments passed to get_mets_data (legend kept from the original notes):
#   fts_ctg : 'anthropometric', 'survey-based' or 'synthesized'
#             (an empty string is passed here)
#   fts     : list of extra features to add (empty here)
#   gender  : 0 = male, 1 = female
#             (None is passed here -- presumably "no gender filter"; TODO confirm)
add_fts = []

# The call returns eight objects; only tr / val are used by the cells shown below.
tr, val, ts, info, beta, tau, ctg_idxs, ctg_dims = get_mets_data(
    fts_ctg='',
    fts=add_fts,
    gender=None,
    ONEHOT=True,
    TABNET=False,
    mrmr=False,
    SEED=100,
)

### Trial 1. Base Model
- Default parameters

In [234]:
# Column 0 of each split is used as the target, every remaining column as a feature.
X_train = tr.iloc[:, 1:]
y_train = tr.iloc[:, 0]
X_valid = val.iloc[:, 1:]
y_valid = val.iloc[:, 0]

# `estimator` is kept as an *unfitted* template that is handed to RFE / RFECV /
# GridSearchCV further down.
args = {'random_state': 100}
estimator = DecisionTreeClassifier(**args)

# Fit the baseline on its own instance. Binding `base_model = estimator` would only
# create a second name for the same object, so any later `estimator.fit(...)` (or a
# fit through another alias) would silently replace the baseline tree.
base_model = DecisionTreeClassifier(**args)
base_model.fit(X_train, y_train)

# Importance table of the baseline tree; 30 is passed as the third argument
# (presumably the number of top features to keep -- TODO confirm in MetsEvaluation).
fts = get_feature_importance(base_model.feature_importances_, X_train.columns, 30)

### Trial 2. Feature Selection

In [238]:
# NOTE(review): the three passes below each rebind `selector`, `result` and
# `slc_fts`. The last pass starts again from `fts.feature`, so only its result
# survives this cell and the first two passes do not influence it. Confirm whether
# the passes were meant to be chained or are alternatives to run one at a time.

# Pass 1 -- RFE keeping 30 features out of the columns listed in `fts.feature`.
ranked_frame = X_train[fts.feature]
selector = RFE(estimator, step=1, n_features_to_select=30, verbose=0)
result = selector.fit(ranked_frame, y_train)
slc_fts = array(ranked_frame.columns)[result.support_]

# Pass 2 -- cross-validated RFE on the pass-1 columns
# (scoring could be 'f1' or 'roc_auc'; 'roc_auc' is used).
pass1_frame = X_train[slc_fts]
selector = RFECV(
    estimator,
    step=1,
    cv=5,
    scoring='roc_auc',
    min_features_to_select=5,
    verbose=0,
)
result = selector.fit(pass1_frame, y_train)
slc_fts = pass1_frame.columns[result.support_]

# Pass 3 -- RFE keeping 10 features, again out of the columns in `fts.feature`.
ranked_frame = X_train[fts.feature]
selector = RFE(estimator, step=1, n_features_to_select=10, verbose=0)
result = selector.fit(ranked_frame, y_train)
slc_fts = array(ranked_frame.columns)[result.support_]

In [241]:
# Fit the selected-feature model on a fresh tree instead of aliasing `estimator`:
# `top_n_model = estimator` would share one object with every other alias of
# `estimator`, so each later fit would overwrite this model as well.
top_n_model = DecisionTreeClassifier(**args)
top_n_model.fit(X_train[slc_fts], y_train)

# NOTE: this rebinds `fts`, which the feature-selection cell reads via `fts.feature`;
# re-running that cell after this one would start from these columns instead of the
# baseline importance table.
fts = get_feature_importance(top_n_model.feature_importances_, X_train[slc_fts].columns, 30)

# Horizontal bar chart of the importances, first row of `fts` at the top.
plt.figure(figsize=(12,8))
plt.barh(fts['feature'], fts['importance'])
plt.gca().invert_yaxis()
plt.show()

### Trial 3. Find Best Combination

In [243]:
# Exhaustive search over every subset of `slc_fts` with at least 3 features.
# Each subset is scored by the 'auc' entry returned by get_metric on the validation
# split; the first subset reaching the highest value is kept (strict `>` comparison).
best_res = 0
best_ft = []

# Use a dedicated tree for the search. `model = estimator` would refit the shared
# template on every subset and thereby also overwrite any model that is merely an
# alias of `estimator`.
model = DecisionTreeClassifier(**args)

for j in range(3, len(slc_fts) + 1):
    print('# Comb :', j)
    # Iterate lazily: materialising all combinations in a list is unnecessary and
    # grows combinatorially with the number of selected features.
    for i, ft in enumerate(combinations(slc_fts, j)):

        # Progress marker every 1000 subsets.
        if i % 1000 == 0:
            print('====',i+1,'====')

        model.fit(X_train.loc[:,ft],y_train)
        prob = model.predict_proba(X_valid.loc[:,ft])
        metric = get_metric(prob, y_valid, 0.5)
        crn = metric['auc']
        if crn > best_res:
            best_res = crn
            best_ft = ft
    print('Best AUC :',best_res, ' Features : ', best_ft)

# Comb : 3
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 4
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 5
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 6
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 7
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 8
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 9
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')
# Comb : 10
==== 1 ====
Best AUC : 0.7961609921424587  Features :  ('BPWC_add', 'WC', 'BPWC_mul')


In [245]:
# Refit on the best feature subset using a fresh tree rather than an alias of
# `estimator`, so that no other name bound to `estimator` is refitted as a side effect.
model = DecisionTreeClassifier(**args)
model.fit(X_train.loc[:,best_ft],y_train)

# Validation metrics at a 0.5 threshold.
# NOTE(review): this is a bare expression in the middle of the cell; with IPython's
# default display settings only the last expression of a cell is echoed -- confirm
# the metrics are actually shown on a fresh run.
prob = model.predict_proba(X_valid.loc[:,best_ft])
get_metric(prob, y_valid, 0.5)

# Importance chart for the chosen subset, first row of `fts` at the top.
fts = get_feature_importance(model.feature_importances_, X_train.loc[:,best_ft].columns, 30)
plt.figure(figsize=(4,3))
plt.barh(fts['feature'], fts['importance'])
plt.gca().invert_yaxis()
plt.show()

{'acc': 0.7168458781362007,
 'bac': 0.7582642787133564,
 'recall': 0.8268733850129198,
 'ppv': 0.3970223325062035,
 'npv': 0.941586748038361,
 'sepecificity': 0.6896551724137931,
 'f1': 0.5364626990779547,
 'auc': 0.7961609921424587}

### Model Tuning

In [248]:
# Search space: 2 * 4 * 3 * 3 * 2 * 3 = 432 parameter combinations.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 4, 5],
    min_samples_split=[0.01, 0.1, 0.2],
    max_features=[None, 'log2', 'sqrt'],
    splitter=['best', 'random'],
    min_samples_leaf=[50, 100, 200],
)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=100)

# Grid search over the tree hyper-parameters, scored by ROC AUC on the folds.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=kf,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# Only the best feature subset found earlier is used for tuning.
X_train_best = X_train.loc[:, best_ft]
grid_search.fit(X_train_best, y_train)

# Rebuild a tree from the winning parameters plus the fixed `args`, then refit it
# on the training columns.
best_model = DecisionTreeClassifier(**grid_search.best_params_, **args)
best_model.fit(X_train_best, y_train)

# Validation metrics at a 0.5 threshold (last expression, so the cell displays it).
prob = best_model.predict_proba(X_valid.loc[:, best_ft])
get_metric(prob, y_valid, 0.5)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


{'acc': 0.738863287250384,
 'bac': 0.7865840981318126,
 'recall': 0.8656330749354005,
 'ppv': 0.4224464060529634,
 'npv': 0.9551724137931035,
 'sepecificity': 0.7075351213282248,
 'f1': 0.5677966101694915,
 'auc': 0.8667468591285753}