In [33]:
import re
import os
import pandas as pd
from itertools import product
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from tools.clf_hyperparameters import run_inference

Load data

In [21]:
df_rose = pd.read_csv('data/.local/ROSE_data.csv')
df_smote = pd.read_csv('data/.local/SMOTE_data.csv')
df_test = pd.read_csv('data/.local/TEST_data.csv')

In [22]:
df_smote.head()

Unnamed: 0,operating gross margin,operating profit rate,tax rate (a),cash flow per share,revenue per share (yuan ¥),realized sales gross profit growth rate,operating profit growth rate,regular net profit growth rate,continuous net profit growth rate,total asset return growth rate ratio,...,cfo to assets,cash flow to equity,current liability to current assets,net income to total assets,total assets to gnp price,no-credit interval,degree of financial leverage (dfl),interest coverage ratio (interest expense to ebit),equity to liability,bankrupt
0,0.62044,0.999212,0.008807,0.348942,0.029055,0.02211,0.848043,0.689519,0.217607,0.264392,...,0.64473,0.329472,0.010759,0.834316,0.003665,0.625232,0.0268,0.565199,0.080997,0
1,0.613932,0.999043,0.227452,0.320365,0.015715,0.022093,0.848022,0.689338,0.217585,0.263891,...,0.590121,0.316132,0.013225,0.808498,0.00076,0.623952,0.026899,0.565624,0.054215,0
2,0.609132,0.998946,0.0,0.321178,0.00481,0.022295,0.848148,0.689709,0.217629,0.26526,...,0.595232,0.314551,0.072953,0.797465,0.001693,0.623293,0.070125,0.570077,0.041042,0
3,0.58984,0.998842,0.0,0.317429,0.004174,0.022141,0.848053,0.689427,0.217602,0.263962,...,0.56364,0.316418,0.058115,0.779479,0.006058,0.624777,0.026617,0.564167,0.029985,0
4,0.605385,0.999054,0.106387,0.309295,0.03701,0.022118,0.848035,0.689294,0.217598,0.263822,...,0.532495,0.311484,0.025038,0.803455,0.001516,0.624009,0.02718,0.566515,0.018928,0


In [30]:
X_rose = df_rose[df_rose.columns[:-1]].to_numpy()
Y_rose = df_rose['bankrupt'].to_numpy()
print("\tROSE sample:\n", pd.Series(Y_rose).value_counts().values)

X_smote = df_smote[df_smote.columns[:-1]].to_numpy()
Y_smote = df_smote['bankrupt'].to_numpy()
print("\tSMOTE sample:\n", pd.Series(Y_rose).value_counts().values)

X_test = df_test[df_test.columns[:-1]].to_numpy()
Y_test = df_test['bankrupt'].to_numpy()
print("\tTest sample:\n", pd.Series(Y_test).value_counts().values)

	ROSE sample:
 [4556 4556]
	SMOTE sample:
 [4556 4556]
	Test sample:
 [1959   60]


Hyperparameters

In [54]:
rf_params1 = {
    'n_estimators': [10, 40, 80],
    'criterion': ['entropy'],
    'max_depth': [5, 20, 60],
    'min_samples_leaf': [2, 4, 6, 10],
    'max_features': [0.25, 0.5, 0.75]
}

In [55]:
rf_params2 = {
    'n_estimators': [10, 40, 80, 120, 200],
    'criterion': ['entropy'],
    'max_depth': [3, 20, 40, 80, 120],
    'min_samples_leaf': [2, 4, 6, 8, 10, 12],
    'max_features': [0.1, 0.15, 0.25, 0.4]
}

In [57]:
print('1st inference for demonstration purpose - compare iterative and multiprocessing approaches\n')
print("\n".join(re.split('(?<=]),', rf_params1.__str__())))
print("\nNumber of settups:", len(list(product(*list(params1.values())))))

1st inference for demonstration purpose - compare iterative and multiprocessing approaches

{'n_estimators': [10, 40, 80]
 'criterion': ['entropy']
 'max_depth': [5, 20, 60]
 'min_samples_leaf': [2, 4, 6, 10]
 'max_features': [0.25, 0.5, 0.75]}

Number of settups: 108


In [58]:
print('2st (main) inference \n')
print("\n".join(re.split('(?<=]),', rf_params2.__str__())))
print("\nNumber of settups:", len(list(product(*list(params2.values())))))

2st (main) inference 

{'n_estimators': [10, 40, 80, 120, 200]
 'criterion': ['entropy']
 'max_depth': [3, 20, 40, 80, 120]
 'min_samples_leaf': [2, 4, 6, 8, 10, 12]
 'max_features': [0.1, 0.15, 0.25, 0.4]}

Number of settups: 600


Iterative

In [59]:
run_inference(data=(X_rose, X_test, Y_rose, Y_test),
              base_clf=RandomForestClassifier,
              params=rf_params1,
              metrics=('f1-score', 'precision', 'recall'),
              classes=(0, 1),
              output_file=os.path.join('data', '.local', 'RF_it_results.csv'),
              multiprocessing_mode=False)

Number of classifiers to be trained: 108


Iterative method: 100%|██████████| 108/108 [02:49<00:00,  1.57s/it]

@timer:<function iterative at 0x7eff4c8813f0> exec. time: 169.55 s.





Multiprocessing

In [60]:
run_inference(data=(X_rose, X_test, Y_rose, Y_test),
              base_clf=RandomForestClassifier,
              params=rf_params1,
              metrics=('f1-score', 'precision', 'recall'),
              classes=(0, 1),
              output_file=os.path.join('data', '.local', 'RF_mp_results.csv'),
              multiprocessing_mode=True)

Number of classifiers to be trained: 108


Multiprocessing method: 100%|██████████| 108/108 [00:23<00:00,  4.52it/s]

Results writen into data/.local/RF_mp_results.csv.
@timer:<function processes at 0x7eff4c881510> exec. time: 24.11 s.





Main inference

In [61]:
run_inference(data=(X_rose, X_test, Y_rose, Y_test),
              base_clf=RandomForestClassifier,
              params=rf_params2,
              metrics=('f1-score', 'precision', 'recall'),
              classes=(0, 1),
              output_file=os.path.join('data', '.local', 'RF_mp_results.csv'),
              multiprocessing_mode=True)

Number of classifiers to be trained: 600


Multiprocessing method: 100%|██████████| 600/600 [02:06<00:00,  4.76it/s]

Results writen into data/.local/RF_mp_results.csv.
@timer:<function processes at 0x7eff4c881510> exec. time: 126.28 s.





In [82]:
rf_res = pd.read_csv(os.path.join('data', '.local', 'RF_mp_results.csv'), sep=';')
rf_res

Unnamed: 0,n_estimators,criterion,max_depth,min_samples_leaf,max_features,f1-score_0,f1-score_1,precision_0,precision_1,recall_0,recall_1
0,10,entropy,3,2,0.10,0.903873,0.231626,0.995092,0.133676,0.827973,0.866667
1,10,entropy,3,6,0.10,0.903873,0.231626,0.995092,0.133676,0.827973,0.866667
2,10,entropy,3,4,0.10,0.903873,0.231626,0.995092,0.133676,0.827973,0.866667
3,10,entropy,3,8,0.10,0.903873,0.231626,0.995092,0.133676,0.827973,0.866667
4,10,entropy,3,6,0.15,0.923119,0.266319,0.994693,0.157895,0.861154,0.850000
...,...,...,...,...,...,...,...,...,...,...,...
595,200,entropy,120,6,0.40,0.983172,0.431034,0.982170,0.446429,0.984176,0.416667
596,200,entropy,120,12,0.25,0.980483,0.472222,0.986563,0.404762,0.974477,0.566667
597,200,entropy,120,8,0.40,0.982097,0.453125,0.984111,0.426471,0.980092,0.483333
598,200,entropy,120,10,0.40,0.982070,0.477612,0.985604,0.432432,0.978560,0.533333


In [83]:
HPRS = ['n_estimators', 'max_depth', 'min_samples_leaf', 'max_features']

In [84]:
f1_q50 = rf_res['f1-score_1'].quantile(0.5)
f1_q99 = rf_res['f1-score_1'].quantile(0.99)

p_q50 = rf_res['precision_1'].quantile(0.5)
p_q99 = rf_res['precision_1'].quantile(0.99)

r_q50 = rf_res['recall_1'].quantile(0.5)
r_q99 = rf_res['recall_1'].quantile(0.99)

Hyperparameter correlations with f1-score, precision and recall

In [94]:
rf_res[rf_res['f1-score_1'] >= f1_q50][HPRS+['f1-score_1']].corr()['f1-score_1']

n_estimators        0.172035
max_depth           0.070712
min_samples_leaf    0.317756
max_features       -0.209077
f1-score_1          1.000000
Name: f1-score_1, dtype: float64

In [95]:
rf_res[rf_res['precision_1'] >= p_q50][HPRS+['precision_1']].corr()['precision_1']

n_estimators        0.037999
max_depth           0.037534
min_samples_leaf   -0.681280
max_features       -0.299728
precision_1         1.000000
Name: precision_1, dtype: float64

In [96]:
rf_res[rf_res['recall_1'] >= r_q50][HPRS+['recall_1']].corr()['recall_1']

n_estimators        0.000044
max_depth          -0.691022
min_samples_leaf   -0.489642
max_features        0.008182
recall_1            1.000000
Name: recall_1, dtype: float64

Best setups

In [97]:
rf_res[HPRS+['f1-score_1']][rf_res['f1-score_1'] >= f1_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,f1-score_1
77,10,80,6,0.1,0.513274
98,10,120,6,0.1,0.513274
171,40,20,10,0.4,0.507463
337,80,120,6,0.1,0.504348
464,120,120,8,0.1,0.504065
533,200,40,8,0.1,0.504065


In [98]:
rf_res[HPRS+['precision_1']][rf_res['precision_1'] >= p_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,precision_1
167,40,40,4,0.1,0.609756
187,40,80,2,0.1,0.615385
281,80,40,2,0.1,0.625
380,120,20,2,0.1,0.615385
425,120,80,2,0.1,0.608696
575,200,120,2,0.15,0.653846


In [99]:
rf_res[HPRS+['recall_1']][rf_res['recall_1'] >= r_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,recall_1
132,40,3,6,0.25,0.933333
136,40,3,8,0.25,0.933333
248,80,3,8,0.15,0.933333
251,80,3,6,0.25,0.933333
257,80,3,12,0.15,0.933333
368,120,3,8,0.15,0.933333
478,200,3,2,0.15,0.933333
494,200,3,10,0.15,0.933333
496,200,3,12,0.15,0.933333


### XGBoost

In [75]:
xgb_params = {'n_estimators': [4, 8, 12, 36, 80, 120],
              'criterion': ['squared_error'],
              'max_depth': [3, 6, 12, 24, 48],
              'min_samples_leaf': [1, 4, 8, 12],
              'max_features': [round(0.1 * x, 1) for x in range(3, 6)],
             'learning_rate': [0.05, 0.1, 0.15]}
print("\n".join(re.split('(?<=]),', xgb_params.__str__())))
print("\nNumber of settups:", len(list(product(*list(xgb_params.values())))))

{'n_estimators': [4, 8, 12, 36, 80, 120]
 'criterion': ['squared_error']
 'max_depth': [3, 6, 12, 24, 48]
 'min_samples_leaf': [1, 4, 8, 12]
 'max_features': [0.3, 0.4, 0.5]
 'learning_rate': [0.05, 0.1, 0.15]}

Number of settups: 1080


In [None]:
run_inference(data=(X_rose, X_test, Y_rose, Y_test),
              base_clf=GradientBoostingClassifier,
              params=xgb_params,
              metrics=('f1-score', 'precision', 'recall'),
              classes=(0, 1),
              output_file=os.path.join('data', '.local', 'XGB_mp_results.csv'),
              multiprocessing_mode=True)

In [100]:
xgb_res = pd.read_csv(os.path.join('data', '.local', 'XGB_mp_results.csv'), sep=';')
xgb_res.head()

Unnamed: 0,n_estimators,criterion,max_depth,min_samples_leaf,max_features,learning_rate,f1-score_0,f1-score_1,precision_0,precision_1,recall_0,recall_1
0,4,squared_error,3,1,0.3,0.15,0.916348,0.194737,0.992303,0.112121,0.851193,0.74
1,4,squared_error,3,1,0.3,0.05,0.899277,0.184685,0.994462,0.104061,0.820721,0.82
2,4,squared_error,3,4,0.3,0.05,0.899277,0.184685,0.994462,0.104061,0.820721,0.82
3,4,squared_error,3,1,0.3,0.1,0.908439,0.194175,0.993965,0.110497,0.836465,0.8
4,4,squared_error,3,4,0.3,0.1,0.908439,0.194175,0.993965,0.110497,0.836465,0.8


In [101]:
f1_q50 = xgb_res['f1-score_1'].quantile(0.5)
f1_q99 = xgb_res['f1-score_1'].quantile(0.99)

p_q50 = xgb_res['precision_1'].quantile(0.5)
p_q99 = xgb_res['precision_1'].quantile(0.99)

r_q50 = xgb_res['recall_1'].quantile(0.5)
r_q99 = xgb_res['recall_1'].quantile(0.99)

Hyperparameter correlations with f1-score, precision and recall

In [102]:
xgb_res[xgb_res['f1-score_1'] > f1_q50][HPRS+['f1-score_1']].corr()['f1-score_1']

n_estimators       -0.161906
max_depth           0.160192
min_samples_leaf    0.201433
max_features       -0.182501
f1-score_1          1.000000
Name: f1-score_1, dtype: float64

In [103]:
xgb_res[xgb_res['precision_1'] > p_q50][HPRS+['precision_1']].corr()['precision_1']

n_estimators        0.441950
max_depth           0.219821
min_samples_leaf   -0.216065
max_features       -0.181280
precision_1         1.000000
Name: precision_1, dtype: float64

In [104]:
xgb_res[xgb_res['recall_1'] > r_q50][HPRS+['recall_1']].corr()['recall_1']

n_estimators        0.042620
max_depth          -0.570724
min_samples_leaf   -0.152589
max_features       -0.006507
recall_1            1.000000
Name: recall_1, dtype: float64

Best setups

In [105]:
xgb_res[HPRS+['f1-score_1']][xgb_res['f1-score_1'] >= f1_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,f1-score_1
178,4,48,8,0.5,0.407767
239,8,6,8,0.4,0.406593
284,8,12,12,0.4,0.418182
361,8,48,12,0.4,0.411215
453,12,12,8,0.4,0.407767
530,12,48,12,0.3,0.408602
636,36,12,12,0.3,0.416667
710,36,48,12,0.3,0.413793
758,80,6,1,0.3,0.42
779,80,6,12,0.3,0.421875


In [107]:
xgb_res[HPRS+['precision_1']][xgb_res['precision_1'] >= p_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,precision_1
647,36,24,1,0.3,0.611111
679,36,48,1,0.3,0.625
825,80,24,1,0.3,0.6875
861,80,48,1,0.3,0.642857
875,80,48,4,0.3,0.6
909,80,48,12,0.4,0.619048
980,120,12,4,0.3,0.625
1005,120,24,1,0.3,0.714286
1019,120,24,4,0.3,0.611111
1026,120,24,4,0.5,0.6


In [108]:
xgb_res[HPRS+['recall_1']][xgb_res['recall_1'] >= r_q99]

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,max_features,recall_1
12,4,3,1,0.5,0.88
170,8,3,1,0.3,0.9
176,8,3,1,0.3,0.9
177,8,3,1,0.3,0.9
181,8,3,1,0.4,0.88
186,8,3,4,0.3,0.9
189,8,3,4,0.3,0.88
195,8,3,8,0.3,0.9
200,8,3,8,0.4,0.88
207,8,3,12,0.3,0.88
