In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statistics
import random

# Feature Selection
# Filter
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import VarianceThreshold
from kydavra import MultiSURFSelector
# Wrapper
import shap
from sklearn.feature_selection import RFE
from boruta import BorutaPy

# model
import xgboost as xgb
from xgboost import XGBClassifier

# parameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# evaluate
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score


from sklearn.feature_selection import SelectKBest

# from functools import reduce

In [3]:
# for threshold
def filter_arr(sample_arr, filter_index):
    result_val = sample_arr[filter_index]
    result_index = filter_index
    compare_index = filter_index + 1
    if sample_arr[0] == sample_arr[filter_index]:
        return (0,0)
        
    if (len(sample_arr) -1 == filter_index):
        compare_index = filter_index - 1
    
    if (sample_arr[filter_index] == sample_arr[compare_index]):
        for index, val in enumerate(sample_arr):
            if (val == result_val):
                result_val = sample_arr[index - 1]
                result_index = index - 1

    return (result_val, result_index)



In [6]:
pd.set_option('display.max_columns', None)

In [4]:
df = pd.read_csv('LSVT_voice_rehabilitation_prepareddata.csv', header=0)
df

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_4_coef,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef,Age,Gender,class
0,0.088112,0.041697,0.000480,-3.723300e-06,0.000422,2.458381,6.332160e-07,47.021079,1366.430390,-7.103323,...,54.335046,145.528630,375.097397,921.296579,2137.079844,4697.131077,9931.208257,68,1,0
1,0.161798,0.057364,0.000677,5.466360e-06,0.000206,2.592066,7.228520e-07,93.557936,2582.922776,-23.284761,...,60.993338,163.560972,421.010306,1036.092589,2404.072562,5284.082128,11165.095660,68,1,1
2,0.554508,0.642913,0.007576,-7.443870e-07,0.006488,12.691326,6.946250e-04,52.988422,466.682635,-45.308680,...,38.641654,103.466808,264.654626,649.657090,1507.384591,3315.804236,6974.600636,68,1,1
3,0.031089,0.027108,0.000314,-2.214720e-07,0.000216,0.754288,1.868650e-07,13.982754,417.217249,-1.207741,...,42.943275,115.014976,296.320795,728.284936,1689.586636,3713.818933,7851.139360,68,0,0
4,0.076177,0.039071,0.000302,2.732110e-05,0.001102,1.270034,4.918190e-05,56.373996,1608.317410,-3.491990,...,52.715873,141.113865,363.511021,893.246151,2071.625622,4554.204815,9623.566242,68,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0.116214,0.070546,0.000837,2.765070e-06,0.000333,1.890006,9.844260e-07,34.083311,896.672947,-10.446397,...,50.085484,134.123291,345.396264,850.942761,1973.383824,4336.099395,9158.984652,63,0,1
122,0.700258,0.334397,0.003959,8.297260e-06,0.001516,4.557797,1.581900e-05,104.648435,1583.166169,-97.281717,...,55.178858,147.584708,379.897760,935.982559,2166.960428,4769.956102,10067.750440,63,0,1
123,0.072635,0.050743,0.000597,-5.277520e-06,0.000434,6.984651,4.993260e-07,21.859427,625.288493,-4.116001,...,46.893015,125.687344,323.728298,795.715774,1845.609006,4056.256338,8583.121863,69,0,0
124,0.111362,0.054237,0.000646,-1.546670e-06,0.000277,1.935398,4.398080e-07,47.870508,1367.843467,-9.373059,...,54.508453,146.094710,376.377835,926.435019,2147.499571,4717.270683,9966.759379,69,0,1


In [5]:
col_names = list(df.columns[:-2])
features = df[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)
df1 = pd.DataFrame(features, columns = col_names)
df1['Gender'] = df['Gender']
df1['class']  = df['class']

df1

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_4_coef,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef,Age,Gender,class
0,-0.399377,-0.431252,-0.427775,-0.193157,-0.349789,-0.185255,-0.190603,-0.330321,-0.264576,0.330353,...,0.085308,0.086062,0.086821,0.068374,0.070190,0.072930,0.086653,0.974953,1,0
1,-0.185464,-0.343912,-0.334808,0.393032,-0.467868,-0.179726,-0.190500,0.106770,0.484146,0.164895,...,0.953682,0.970554,0.959255,0.960115,0.974571,0.974684,0.994152,0.974953,1,1
2,0.954598,2.920368,2.925892,-0.003139,2.967826,0.237966,0.609078,-0.274274,-0.818349,-0.060302,...,-1.961422,-1.977083,-2.011804,-2.041734,-2.062769,-2.049252,-2.087872,0.974953,1,1
3,-0.564921,-0.512586,-0.506216,0.030217,-0.462049,-0.255734,-0.191117,-0.640630,-0.848794,0.390636,...,-1.400405,-1.410642,-1.410086,-1.430949,-1.445598,-1.437768,-1.443196,0.974953,0,0
4,-0.434025,-0.445895,-0.512084,1.787099,0.022002,-0.234403,-0.134661,-0.242475,-0.115700,0.367279,...,-0.125864,-0.130483,-0.133342,-0.149523,-0.151522,-0.146653,-0.139612,0.974953,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,-0.317797,-0.270429,-0.259019,0.220722,-0.398413,-0.208762,-0.190198,-0.451838,-0.553700,0.296169,...,-0.468919,-0.473373,-0.477558,-0.478137,-0.484295,-0.481736,-0.481302,0.172050,0,1
122,1.377720,1.200474,1.216387,0.573609,0.248428,-0.098426,-0.173105,0.210936,-0.131180,-0.591733,...,0.195358,0.186914,0.178037,0.182456,0.171404,0.184814,0.187077,0.172050,0,1
123,-0.444310,-0.380824,-0.372500,-0.292298,-0.343190,0.001946,-0.190757,-0.566649,-0.720731,0.360899,...,-0.885281,-0.887159,-0.889291,-0.907143,-0.917105,-0.911669,-0.904837,1.135533,0,0
124,-0.331882,-0.361349,-0.349708,-0.054315,-0.428717,-0.206885,-0.190826,-0.322343,-0.263706,0.307144,...,0.107924,0.113829,0.111152,0.108290,0.105485,0.103871,0.112800,1.135533,0,1


In [6]:
# df1['class'] = df1.apply(lambda row: categorise(row), axis=1)
print('class 0 = ',df1['class'].value_counts()[0],'ตัว')
print('class 1 = ',df1['class'].value_counts()[1],'ตัว')
print('Majority Class= ', df1['class'].value_counts()[0]/len(df1))

class 0 =  42 ตัว
class 1 =  84 ตัว
Majority Class=  0.3333333333333333


In [7]:
# threshold:

log_2base = math.log(len(df1),2.0)
base2n = int(round(log_2base,0))
p2_5 = int(round(len(df1.columns)*0.025,0))
p5 =  int(round(len(df1.columns)*0.05,0))
p10 =  int(round(len(df1.columns)*0.1,0))
p20 =  int(round(len(df1.columns)*0.2,0))

print('treshold logbase2(n):', base2n)
print('treshold 2.5%:',p2_5)
print('treshold 5%:',p5)
print('treshold 10%:',p10)
print('treshold 20%:',p20)

thres_d = {'base2n':base2n,
              'p2_5':p2_5,
              'p5':p5,
              'p10':p10,
              'p20':p20}


thres_d


treshold logbase2(n): 7
treshold 2.5%: 8
treshold 5%: 16
treshold 10%: 31
treshold 20%: 63


{'base2n': 7, 'p2_5': 8, 'p5': 16, 'p10': 31, 'p20': 63}

In [8]:
X_ = df1.drop(['class'], axis=1)
len(X_.columns)

312

In [9]:
paramXGBoost_grid1 = {"subsample":[0.5, 0.75, 1],
              "colsample_bytree":[0.5, 0.75, 1],
              "max_depth":[5, 6, 7],
              "min_child_weight":[1,5,15],
              "learning_rate":[0.1, 0.01, 0.05],
              'eta': [0.01, 0.015, 0.02]
    }

In [10]:
paramXGBoost_grid2 = {"subsample":[0.2,0.6,0.9],
              "colsample_bytree":[0.5, 0.75, 1],
              "max_depth":[7,15,25],
              "min_child_weight":[0.1,0.5,1],
              "learning_rate":[0.1, 0.01, 0.05],
              'eta': [0.01, 0.015, 0.02]
    }

In [11]:
paramXGBoost_grid3 = {"subsample":[0.2,0.6,0.9],
              "colsample_bytree":[0.1,0.25, 0.5],
              "max_depth":[10,15,20],
              "min_child_weight":[20,40,60],
              "learning_rate":[0.1, 0.01, 0.05],
              'eta': [0.01, 0.015, 0.02]
    }

In [12]:
paramXGBoost_grid4 = {"subsample":[0.75,0.9,1],
              "colsample_bytree":[0.1,0.25, 0.5],
              "max_depth":[5,10,15],
              "min_child_weight":[5,10,20],
              "learning_rate":[0.1, 0.01, 0.05],
              'eta': [0.01, 0.015, 0.02]
    }

In [13]:
param_d = {'set1':paramXGBoost_grid1,
           'set2':paramXGBoost_grid2,
           'set3':paramXGBoost_grid3,
           'set4':paramXGBoost_grid4
          }

param_d

{'set1': {'subsample': [0.5, 0.75, 1],
  'colsample_bytree': [0.5, 0.75, 1],
  'max_depth': [5, 6, 7],
  'min_child_weight': [1, 5, 15],
  'learning_rate': [0.1, 0.01, 0.05],
  'eta': [0.01, 0.015, 0.02]},
 'set2': {'subsample': [0.2, 0.6, 0.9],
  'colsample_bytree': [0.5, 0.75, 1],
  'max_depth': [7, 15, 25],
  'min_child_weight': [0.1, 0.5, 1],
  'learning_rate': [0.1, 0.01, 0.05],
  'eta': [0.01, 0.015, 0.02]},
 'set3': {'subsample': [0.2, 0.6, 0.9],
  'colsample_bytree': [0.1, 0.25, 0.5],
  'max_depth': [10, 15, 20],
  'min_child_weight': [20, 40, 60],
  'learning_rate': [0.1, 0.01, 0.05],
  'eta': [0.01, 0.015, 0.02]},
 'set4': {'subsample': [0.75, 0.9, 1],
  'colsample_bytree': [0.1, 0.25, 0.5],
  'max_depth': [5, 10, 15],
  'min_child_weight': [5, 10, 20],
  'learning_rate': [0.1, 0.01, 0.05],
  'eta': [0.01, 0.015, 0.02]}}

# XGBoost

In [14]:
# baseline model

X = df1.drop(['class'], axis=1)
y = df1['class']
 


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) 

""" do not need to split the data into training and test sets when using 
cross_val_score from the scikit-learn library. The purpose of 
cross_val_score is to perform cross-validation on the entire dataset, 
which automatically splits the data into training and validation sets 
for each fold. """
# XGBoost
xgbc = XGBClassifier(random_state=123)

## results F1 and stdF1 ##
avgF1_baselineXGBoost = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).mean() ##<3## 
stdF1_baselineXGBoost = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).std() ##<3##
print('Mean F1 Score: ', avgF1_baselineXGBoost)
print('std F1 Score: ', stdF1_baselineXGBoost)



Mean F1 Score:  0.8643578643578642
std F1 Score:  0.04772011171804006


In [15]:
n_baselineXGBoost = len(X_.columns)
n_baselineXGBoost

312

In [16]:
# tuned baseline model_ parameterset1 


# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgbc, param_grid = paramXGBoost_grid1,
                           cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
# Fit the grid search to the data
grid_search.fit(X,y)

print(grid_search.best_params_)


xgbc_tuned1 = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                   max_depth=grid_search.best_params_['max_depth'],
                                   colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                   min_child_weight=grid_search.best_params_['min_child_weight'],
                                   learning_rate=grid_search.best_params_['learning_rate'],
                                   eta=grid_search.best_params_['eta'])
xgbc_tuned1.fit(X,y)

print('\n')

## results F1 and stdF1 ##
avgF1set1_baselineXGBoost = cross_val_score(xgbc_tuned1, X, y, scoring="f1", cv = 10).mean() ##<3## 
stdF1set1_baselineXGBoost = cross_val_score(xgbc_tuned1, X, y, scoring="f1", cv = 10).std() ##<3##
print('Mean F1 Score: ', avgF1set1_baselineXGBoost)
print('std F1 Score: ', stdF1set1_baselineXGBoost)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 1, 'eta': 0.01, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.75}


Mean F1 Score:  0.9003102453102454
std F1 Score:  0.05479680704989319


In [17]:
# tuned baseline model_ parameterset2


# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgbc, param_grid = paramXGBoost_grid2,
                           cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
# Fit the grid search to the data
grid_search.fit(X,y)


print(grid_search.best_params_)


xgbc_tuned2 = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                   max_depth=grid_search.best_params_['max_depth'],
                                   colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                   min_child_weight=grid_search.best_params_['min_child_weight'],
                                   learning_rate=grid_search.best_params_['learning_rate'],
                                   eta=grid_search.best_params_['eta'])
xgbc_tuned2.fit(X,y)

print('\n')

## results F1 and stdF1 ##
avgF1set2_baselineXGBoost = cross_val_score(xgbc_tuned2, X, y, scoring="f1", cv = 10).mean() ##<3## 
stdF1set2_baselineXGBoost = cross_val_score(xgbc_tuned2, X, y, scoring="f1", cv = 10).std() ##<3##
print('Mean F1 Score: ', avgF1set2_baselineXGBoost)
print('std F1 Score: ', stdF1set2_baselineXGBoost)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 0.5, 'eta': 0.01, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 0.5, 'subsample': 0.6}


Mean F1 Score:  0.8961062303709362
std F1 Score:  0.06205463659536259


In [18]:
# tuned baseline model_ parameterset3


# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgbc, param_grid = paramXGBoost_grid3,
                           cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
# Fit the grid search to the data
grid_search.fit(X,y)


print(grid_search.best_params_)


xgbc_tuned3 = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                   max_depth=grid_search.best_params_['max_depth'],
                                   colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                   min_child_weight=grid_search.best_params_['min_child_weight'],
                                   learning_rate=grid_search.best_params_['learning_rate'],
                                   eta=grid_search.best_params_['eta'])
xgbc_tuned3.fit(X,y)

print('\n')

## results F1 and stdF1 ##
avgF1set3_baselineXGBoost = cross_val_score(xgbc_tuned3, X, y, scoring="f1", cv = 10).mean() ##<3## 
stdF1set3_baselineXGBoost = cross_val_score(xgbc_tuned3, X, y, scoring="f1", cv = 10).std() ##<3##
print('Mean F1 Score: ', avgF1set3_baselineXGBoost)
print('std F1 Score: ', stdF1set3_baselineXGBoost)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 0.1, 'eta': 0.01, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 20, 'subsample': 0.9}


Mean F1 Score:  0.7996536796536796
std F1 Score:  0.02055142088233746


In [19]:
# tuned baseline model_ parameterset4

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgbc, param_grid = paramXGBoost_grid4,
                           cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
# Fit the grid search to the data
grid_search.fit(X,y)


print(grid_search.best_params_)


xgbc_tuned4 = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                   max_depth=grid_search.best_params_['max_depth'],
                                   colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                   min_child_weight=grid_search.best_params_['min_child_weight'],
                                   learning_rate=grid_search.best_params_['learning_rate'],
                                   eta=grid_search.best_params_['eta'])
xgbc_tuned4.fit(X,y)

## results F1 and stdF1 ##
avgF1set4_baselineXGBoost = cross_val_score(xgbc_tuned4, X, y, scoring="f1", cv = 10).mean() ##<3## 
stdF1set4_baselineXGBoost = cross_val_score(xgbc_tuned4, X, y, scoring="f1", cv = 10).std() ##<3##
print('Mean F1 Score: ', avgF1set4_baselineXGBoost)
print('std F1 Score: ', stdF1set4_baselineXGBoost)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 0.1, 'eta': 0.01, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 1}
Mean F1 Score:  0.9017390289449114
std F1 Score:  0.04974087795519576


In [20]:
output_basedlineXGBoost = {'n_baselineXGBoost':n_baselineXGBoost ,
                   'avgF1_baselineXGBoost':avgF1_baselineXGBoost ,
                   'stdF1_baselineXGBoost': stdF1_baselineXGBoost,
                   'avgF1set1_baselineXGBoost':avgF1set1_baselineXGBoost  ,
                   'stdF1set1_baselineXGBoost': stdF1set1_baselineXGBoost,
                   'avgF1set2_baselineXGBoost':avgF1set2_baselineXGBoost ,
                   'stdF1set2_baselineXGBoost':stdF1set2_baselineXGBoost ,
                   'avgF1set3_baselineXGBoost':avgF1set3_baselineXGBoost ,
                   'stdF1set3_baselineXGBoost':stdF1set3_baselineXGBoost ,
                   'avgF1set4_baselineXGBoost':avgF1set4_baselineXGBoost ,
                   'stdF1set4_baselineXGBoost':stdF1set4_baselineXGBoost ,
                     }
output_basedlineXGBoost

{'n_baselineXGBoost': 312,
 'avgF1_baselineXGBoost': 0.8643578643578642,
 'stdF1_baselineXGBoost': 0.04772011171804006,
 'avgF1set1_baselineXGBoost': 0.9003102453102454,
 'stdF1set1_baselineXGBoost': 0.05479680704989319,
 'avgF1set2_baselineXGBoost': 0.8961062303709362,
 'stdF1set2_baselineXGBoost': 0.06205463659536259,
 'avgF1set3_baselineXGBoost': 0.7996536796536796,
 'stdF1set3_baselineXGBoost': 0.02055142088233746,
 'avgF1set4_baselineXGBoost': 0.9017390289449114,
 'stdF1set4_baselineXGBoost': 0.04974087795519576}

In [21]:
output_basedlineXGBoost = pd.DataFrame([output_basedlineXGBoost])
output_basedlineXGBoost = output_basedlineXGBoost.transpose()
output_basedlineXGBoost.to_csv('output_basedlineXGBoost.csv', index=True)

## Feature Selection

# Filter

In [22]:
# Mutual Information: Filter method

mi_XGBoostscore = MIC(X,y)
mi_XGBoostfeatures = pd.DataFrame({'Feature':list(X_.columns),
                            'value_MI':mi_XGBoostscore})
mi_XGBoostfeatures['value_MI'] = mi_XGBoostfeatures['value_MI'].map('{:,.19f}'.format)
""" the more Mutual Information is the more feature importance it will have"""
mi_XGBoostfeatures = mi_XGBoostfeatures.sort_values(by='value_MI',ascending=False)
# mi_XGBoostfeatures = mi_XGBoostfeatures.set_index('Feature')
mi_XGBoostfeatures = mi_XGBoostfeatures.reset_index()
mi_XGBoostfeatures.to_csv('mi_XGBoostfeatures.csv', index=True)

mi_XGBoostfeatures

Unnamed: 0,index,Feature,value_MI
0,0,Jitter->F0_abs_dif,0.2471220337085198171
1,9,Jitter->F0_TKEO_prc5,0.2159629565098533099
2,10,Jitter->F0_TKEO_prc25,0.2136084295259312871
3,143,entropy_shannon_5_coef,0.2070833897803643353
4,152,entropy_log_4_coef,0.2011992950764724952
...,...,...,...
307,75,VFER->NSR_SEO,0.0000000000000000000
308,269,det_TKEO_std3_10_coef,0.0000000000000000000
309,74,VFER->NSR_TKEO,0.0000000000000000000
310,69,VFER->entropy,0.0000000000000000000


In [23]:
# variance treshold: Filter method
"""Variance threshold is a simple method for feature selection that removes all features 
whose variance doesn't meet a certain threshold."""

threshold = 0.5
selector = VarianceThreshold(threshold)
selector = selector.fit(X)
variances = selector.variances_

variancethres_XGBoostfeatures = pd.DataFrame({'Feature':list(X_.columns),
                            'value_variancethres':variances})
# variancethres_XGBoostfeatures['value_variancethres'] = variancethres_XGBoostfeatures['value_variancethres'].map('{:,.19f}'.format)


"""The idea behind this method is that features 
with low variance don't contain much information and can be removed without affecting 
the performance of the model."""
variancethres_XGBoostfeatures = variancethres_XGBoostfeatures.sort_values(by='value_variancethres',ascending=False)
# variancethres_XGBoostfeatures = variancethres_XGBoostfeatures.set_index('Feature')
variancethres_XGBoostfeatures = variancethres_XGBoostfeatures.reset_index()
variancethres_XGBoostfeatures.to_csv('variancethres_XGBoostfeatures.csv', index=True)

variancethres_XGBoostfeatures

Unnamed: 0,index,Feature,value_variancethres
0,219,Ea2,1.000000
1,110,delta delta log energy,1.000000
2,161,det_TKEO_mean_3_coef,1.000000
3,72,VFER->SNR_SEO,1.000000
4,259,det_TKEO_mean3_10_coef,1.000000
...,...,...,...
307,163,det_TKEO_mean_5_coef,1.000000
308,169,det_TKEO_std_1_coef,1.000000
309,226,Ed2_7_coef,1.000000
310,127,Data_length,1.000000


In [24]:
# MultiSURF: Filter method

features = list(X_.columns)

msrf = MultiSURFSelector(n_features=len(df1.columns)-1)
selected_cols = msrf.select(df1,'class')

list_of_numbers = list(range(1, len(df1.columns)))

MultiSURF_XGBoostfeatures = pd.DataFrame({'Feature':selected_cols,
                            'value_MultiSURF':list_of_numbers})
# # MultiSURF_XGBoostfeatures['value_MultiSURF'] = MultiSURF_XGBoostfeatures['value_MultiSURF'].map('{:,.19f}'.format)
# เรียงจากเลขน้อยคือสำคัญสูงสุด
MultiSURF_XGBoostfeatures = MultiSURF_XGBoostfeatures.sort_values(by='value_MultiSURF',ascending=True)
# MultiSURF_XGBoostfeatures = MultiSURF_XGBoostfeatures.set_index('Feature')
MultiSURF_XGBoostfeatures = MultiSURF_XGBoostfeatures.reset_index()

MultiSURF_XGBoostfeatures.to_csv('MultiSURF_XGBoostfeatures.csv', index=True)

MultiSURF_XGBoostfeatures

Unnamed: 0,index,Feature,value_MultiSURF
0,0,Data_length,1
1,1,entropy_shannon_8_coef,2
2,2,entropy_shannon3_8_coef,3
3,3,det_TKEO_mean_7_coef,4
4,4,entropy_shannon_7_coef,5
...,...,...,...
307,307,10th delta,308
308,308,MFCC_8th coef,309
309,309,5th delta,310
310,310,4th delta,311


# Wrapper

In [25]:
# SHAP: Wrapper method


# Explain model predictions using shap library:
explainer = shap.TreeExplainer(xgbc_tuned4) #### 
shap_values = explainer.shap_values(X)
vals= np.abs(shap_values).mean(0)

SHAP_XGBoostfeatures= pd.DataFrame({'Feature':list(X_.columns),
                                 'value_SHAP':vals})
SHAP_XGBoostfeatures['value_SHAP'] = SHAP_XGBoostfeatures['value_SHAP'].map('{:,.19f}'.format)
SHAP_XGBoostfeatures = SHAP_XGBoostfeatures.sort_values(by=['value_SHAP'],ascending=False)
SHAP_XGBoostfeatures = SHAP_XGBoostfeatures.set_index('Feature')
SHAP_XGBoostfeatures.to_csv('SHAP_XGBoostfeatures.csv', index=True)

SHAP_XGBoostfeatures

Unnamed: 0_level_0,value_SHAP
Feature,Unnamed: 1_level_1
MFCC_2nd coef,0.3918479681015014648
GNE->NSR_SEO,0.3068278729915618896
Log energy,0.2385229170322418213
F0_series_F0_expTitze,0.2027625888586044312
Jitter->pitch_PQ5_classical_Schoentgen,0.1940937191247940063
...,...
Ed_3_coef,0.0000000000000000000
Ed_4_coef,0.0000000000000000000
Ed_5_coef,0.0000000000000000000
Ed_6_coef,0.0000000000000000000


In [26]:
# Recursive Feature Elimination(RFE): Wrapper method


"""In this example, the parameter "n_features_to_select" is set to 1, 
which means that only one feature will be selected at each iteration 
of RFE"""

rfe = RFE(estimator=xgbc_tuned4, n_features_to_select=1) #### changehere
rfe.fit(X,y)

rfe_XGBoostfeatures = pd.DataFrame({'Feature':list(X_.columns),
                            'value_rfe':rfe.ranking_})

# rfe_XGBoostfeatures['value_XGBooste'] = rfe_XGBoostfeatures['value_XGBooste'].astype(int)
# rfe_XGBoostfeatures['value_XGBooste'] = rfe_XGBoostfeatures['value_XGBooste'].map('{:,.19f}'.format)
"""The RFE assigns a ranking value to each feature, typically starting from 1 for 
the most important feature, and increasing for the less important features."""
rfe_XGBoostfeatures = rfe_XGBoostfeatures.sort_values(by=['value_rfe'],ascending=True)
rfe_XGBoostfeatures = rfe_XGBoostfeatures.set_index('Feature')

rfe_XGBoostfeatures.to_csv('rfe_XGBoostfeatures.csv', index=True)

rfe_XGBoostfeatures

Unnamed: 0_level_0,value_rfe
Feature,Unnamed: 1_level_1
entropy_log_4_coef,1
MFCC_2nd coef,2
Jitter->pitch_PQ5_classical_Schoentgen,3
entropy_shannon_5_coef,4
prc95_5_F0_series_F0_expTitze,5
...,...
entropy_log_5_coef,308
entropy_log_7_coef,309
entropy_log_6_coef,310
Age,311


In [27]:
# Boruta: Wrapper method




feat_selector = BorutaPy(xgbc_tuned4, n_estimators='auto', verbose=2, random_state=1) #### changehere
# feat_selector.fit(X_train, y_train)
feat_selector = feat_selector.fit(np.array(X), np.array(y)) 
features = X.columns
feature_importance = feat_selector.ranking_
# feature_importance
boruta_XGBoostfeatures = pd.DataFrame(list(zip(X_.columns, feature_importance[0:].tolist())), 
                                  columns=['Feature','value_boruta'])

# output the feature importance ranking: 1 is most importance >ascending=True
boruta_XGBoostfeatures = boruta_XGBoostfeatures.sort_values(by=['value_boruta'],ascending=True)
boruta_XGBoostfeatures = boruta_XGBoostfeatures.set_index('Feature')

boruta_XGBoostfeatures.to_csv('boruta_XGBoostfeatures.csv', index=True)

boruta_XGBoostfeatures

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	312
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	250
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	250
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	250
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	250
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	53
Rejected: 	259
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	53
Rejected: 	259
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	53
Rejected: 	259
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	53
Rejected: 	259
Iteration: 	16 / 100
Confirmed: 	0
Tentat

Unnamed: 0_level_0,value_boruta
Feature,Unnamed: 1_level_1
Jitter->F0_abs_dif,1
entropy_log_4_coef,1
std_F0_series_F0_expTitze,1
det_TKEO_mean_8_coef,1
DFA,1
...,...
entropy_shannon_4_coef,189
entropy_shannon_6_coef,189
entropy_log_1_coef,189
6th delta,189


# single FS

# Filter

In [28]:
method_Filter = {'MI': mi_XGBoostfeatures,
          'variancethres': variancethres_XGBoostfeatures,
          'MultiSURF': MultiSURF_XGBoostfeatures
         }

In [29]:
output_filter = dict()


for tres,t in thres_d.items():
    for met,m in method_Filter.items():
        k =(filter_arr(m.filter(regex='value').squeeze(), t-1))   
        if k[1] != 0:
            n = k[1]+1 
            print("n"+"_single"+met+"_"+tres+"XGBoost",n)
            output_filter["n"+"_single"+met+"_"+tres+"XGBoost"] = n
            df_selected = m.iloc[:k[1]+1]
            df_filter = df1.filter(items=df_selected['Feature'])
            X = df_filter
            y = df1['class']
            
            ## before tuned ##
            xgbc = XGBClassifier(random_state=123)
            xgbc.fit(X,y)
            ## results F1 and stdF1 ##
            m = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).mean() 
            s = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).std() 
            print("avgF1"+"_single"+met+"_"+tres+"XGBoost" , m)
            print("stdF1"+"_single"+met+"_"+tres+"XGBoost",s)
            output_filter["avgF1"+"_single"+met+"_"+tres+"XGBoost"] = m
            output_filter["stdF1"+"_single"+met+"_"+tres+"XGBoost"] = s
            
            for parm, p in param_d.items():
                # Instantiate the grid search model
                grid_search = GridSearchCV(estimator = xgbc, param_grid = p,
                                        cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
                # Fit the grid search to the data
                grid_search.fit(X,y)
        #         print(grid_search.best_params_)
                xgbc_tuned = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                                max_depth=grid_search.best_params_['max_depth'],
                                                colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                                min_child_weight=grid_search.best_params_['min_child_weight'],
                                                learning_rate=grid_search.best_params_['learning_rate'],
                                                eta=grid_search.best_params_['eta'])
                xgbc_tuned.fit(X,y)

                ## results F1 and stdF1 ##
                m_tune  = cross_val_score(xgbc_tuned, X, y, scoring="f1", cv = 10).mean() 
                s_tune  = cross_val_score(xgbc_tuned, X, y, scoring="f1", cv = 10).std() 
                print("avgF1"+parm+"_single"+met+"_"+tres+"XGBoost" , m_tune)
                print("stdF1"+parm+"_single"+met+"_"+tres+"XGBoost",s_tune)
                output_filter["avgF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = m_tune
                output_filter["stdF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = s_tune

        else:
            output_filter["n"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            output_filter["avgF1"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            output_filter["stdF1"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            for parm, p in param_d.items():
                output_filter["avgF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = k[1]
                output_filter["stdF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = k[1]
         

        


n_singleMI_base2nXGBoost 7
avgF1_singleMI_base2nXGBoost 0.7720696443219662
stdF1_singleMI_base2nXGBoost 0.13859793622453048
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set1_singleMI_base2nXGBoost 0.8276521053765634
stdF1set1_singleMI_base2nXGBoost 0.11723963541498901
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set2_singleMI_base2nXGBoost 0.8368250377073906
stdF1set2_singleMI_base2nXGBoost 0.11536066772061342
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set3_singleMI_base2nXGBoost 0.7996536796536796
stdF1set3_singleMI_base2nXGBoost 0.02055142088233746
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set4_singleMI_base2nXGBoost 0.8489923337756153
stdF1set4_singleMI_base2nXGBoost 0.1287265356043338
n_singlevariancethres_base2nXGBoost 3
avgF1_singlevariancethres_base2nXGBoost 0.8139094058676102
stdF1_singlevariancethres_base2nXGBoost 0.07767029649821591
Fitting 5 folds for each of 729 candidates, 

In [30]:
output_filter

{'n_singleMI_base2nXGBoost': 7,
 'avgF1_singleMI_base2nXGBoost': 0.7720696443219662,
 'stdF1_singleMI_base2nXGBoost': 0.13859793622453048,
 'avgF1set1_singleMI_base2nXGBoost': 0.8276521053765634,
 'stdF1set1_singleMI_base2nXGBoost': 0.11723963541498901,
 'avgF1set2_singleMI_base2nXGBoost': 0.8368250377073906,
 'stdF1set2_singleMI_base2nXGBoost': 0.11536066772061342,
 'avgF1set3_singleMI_base2nXGBoost': 0.7996536796536796,
 'stdF1set3_singleMI_base2nXGBoost': 0.02055142088233746,
 'avgF1set4_singleMI_base2nXGBoost': 0.8489923337756153,
 'stdF1set4_singleMI_base2nXGBoost': 0.1287265356043338,
 'n_singlevariancethres_base2nXGBoost': 3,
 'avgF1_singlevariancethres_base2nXGBoost': 0.8139094058676102,
 'stdF1_singlevariancethres_base2nXGBoost': 0.07767029649821591,
 'avgF1set1_singlevariancethres_base2nXGBoost': 0.8442378338895367,
 'stdF1set1_singlevariancethres_base2nXGBoost': 0.046392444546284016,
 'avgF1set2_singlevariancethres_base2nXGBoost': 0.8379108645052918,
 'stdF1set2_singlevarian

In [31]:

output_single_Filter_XGBoost = pd.DataFrame([output_filter])
output_single_Filter_XGBoost = output_single_Filter_XGBoost.transpose()
output_single_Filter_XGBoost.to_csv('output_single_Filter_XGBoost.csv', index=True)

# Wrapper

In [32]:
SHAP_XGBoostfeatures = pd.read_csv('SHAP_XGBoostfeatures.csv')
rfe_XGBoostfeatures = pd.read_csv('rfe_XGBoostfeatures.csv')
boruta_XGBoostfeatures = pd.read_csv('boruta_XGBoostfeatures.csv')

In [33]:
method_Wrapper = {  'SHAP':SHAP_XGBoostfeatures,
                    'rfe': rfe_XGBoostfeatures,
                    'boruta': boruta_XGBoostfeatures
                }

In [34]:
output_wrapper = dict()


for tres,t in thres_d.items():
    for met,m in method_Wrapper.items():
        k =(filter_arr(m.filter(regex='value').squeeze(), t-1))   
        if k[1] != 0:
            n = k[1]+1 
            print("n"+"_single"+met+"_"+tres+"XGBoost",n)
            output_wrapper["n"+"_single"+met+"_"+tres+"XGBoost"] = n
            df_selected = m.iloc[:k[1]+1]
            df_filter = df1.filter(items=df_selected['Feature'])
            X = df_filter
            y = df1['class']
            
            ## before tuned ##
            xgbc = XGBClassifier(random_state=123)
            xgbc.fit(X,y)
            ## results F1 and stdF1 ##
            m = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).mean() 
            s = cross_val_score(xgbc, X, y, scoring="f1", cv = 10).std() 
            print("avgF1"+"_single"+met+"_"+tres+"XGBoost" , m)
            print("stdF1"+"_single"+met+"_"+tres+"XGBoost",s)
            output_wrapper["avgF1"+"_single"+met+"_"+tres+"XGBoost"] = m
            output_wrapper["stdF1"+"_single"+met+"_"+tres+"XGBoost"] = s
            
            for parm, p in param_d.items():
                # Instantiate the grid search model
                grid_search = GridSearchCV(estimator = xgbc, param_grid = p,
                                        cv = 5, n_jobs = -1, verbose = 2,scoring='f1')
                # Fit the grid search to the data
                grid_search.fit(X,y)
        #         print(grid_search.best_params_)
                xgbc_tuned = XGBClassifier(subsample=grid_search.best_params_['subsample'],
                                                max_depth=grid_search.best_params_['max_depth'],
                                                colsample_bytree=grid_search.best_params_['colsample_bytree'],
                                                min_child_weight=grid_search.best_params_['min_child_weight'],
                                                learning_rate=grid_search.best_params_['learning_rate'],
                                                eta=grid_search.best_params_['eta'])
                xgbc_tuned.fit(X,y)

                ## results F1 and stdF1 ##
                m_tune  = cross_val_score(xgbc_tuned, X, y, scoring="f1", cv = 10).mean() 
                s_tune  = cross_val_score(xgbc_tuned, X, y, scoring="f1", cv = 10).std() 
                print("avgF1"+parm+"_single"+met+"_"+tres+"XGBoost" , m_tune)
                print("stdF1"+parm+"_single"+met+"_"+tres+"XGBoost",s_tune)
                output_wrapper["avgF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = m_tune
                output_wrapper["stdF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = s_tune

        else:
            output_wrapper["n"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            output_wrapper["avgF1"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            output_wrapper["stdF1"+"_single"+met+"_"+tres+"XGBoost"] = k[1]
            for parm, p in param_d.items():
                output_wrapper["avgF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = k[1]
                output_wrapper["stdF1"+parm+"_single"+met+"_"+tres+"XGBoost"] = k[1]
         

        


n_singleSHAP_base2nXGBoost 7
avgF1_singleSHAP_base2nXGBoost 0.9203479286451423
stdF1_singleSHAP_base2nXGBoost 0.044146955569844305
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set1_singleSHAP_base2nXGBoost 0.9003186888790605
stdF1set1_singleSHAP_base2nXGBoost 0.05253382766289704
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set2_singleSHAP_base2nXGBoost 0.9119007076514816
stdF1set2_singleSHAP_base2nXGBoost 0.04303135105353047
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set3_singleSHAP_base2nXGBoost 0.7996536796536796
stdF1set3_singleSHAP_base2nXGBoost 0.02055142088233746
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
avgF1set4_singleSHAP_base2nXGBoost 0.9194707356626862
stdF1set4_singleSHAP_base2nXGBoost 0.027564712853517293
n_singlerfe_base2nXGBoost 7
avgF1_singlerfe_base2nXGBoost 0.8978527294858566
stdF1_singlerfe_base2nXGBoost 0.06221018551553827
Fitting 5 folds for each of 729 candidates, total

In [35]:
output_wrapper

{'n_singleSHAP_base2nXGBoost': 7,
 'avgF1_singleSHAP_base2nXGBoost': 0.9203479286451423,
 'stdF1_singleSHAP_base2nXGBoost': 0.044146955569844305,
 'avgF1set1_singleSHAP_base2nXGBoost': 0.9003186888790605,
 'stdF1set1_singleSHAP_base2nXGBoost': 0.05253382766289704,
 'avgF1set2_singleSHAP_base2nXGBoost': 0.9119007076514816,
 'stdF1set2_singleSHAP_base2nXGBoost': 0.04303135105353047,
 'avgF1set3_singleSHAP_base2nXGBoost': 0.7996536796536796,
 'stdF1set3_singleSHAP_base2nXGBoost': 0.02055142088233746,
 'avgF1set4_singleSHAP_base2nXGBoost': 0.9194707356626862,
 'stdF1set4_singleSHAP_base2nXGBoost': 0.027564712853517293,
 'n_singlerfe_base2nXGBoost': 7,
 'avgF1_singlerfe_base2nXGBoost': 0.8978527294858566,
 'stdF1_singlerfe_base2nXGBoost': 0.06221018551553827,
 'avgF1set1_singlerfe_base2nXGBoost': 0.8978527294858566,
 'stdF1set1_singlerfe_base2nXGBoost': 0.06221018551553827,
 'avgF1set2_singlerfe_base2nXGBoost': 0.904704776647501,
 'stdF1set2_singlerfe_base2nXGBoost': 0.05444276242063975,
 '

In [36]:

output_single_wrapper_XGBoost = pd.DataFrame([output_wrapper])
output_single_wrapper_XGBoost = output_single_wrapper_XGBoost.transpose()
output_single_wrapper_XGBoost.to_csv('output_single_wrapper_XGBoost.csv', index=True)