In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn import feature_selection
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from scipy import stats
from feature_engine.selection import SelectBySingleFeaturePerformance

In [2]:
# Load the training split, discard its two leading columns and strip the
# whitespace surrounding the remaining header names.
df = pd.read_csv('dev_train.csv')
df = df.drop(columns=df.columns[:2])
df.columns = df.columns.str.strip()

In [3]:
# Split the frame into the target (first column) and the features (all other
# columns). The features are taken as an explicit copy: new columns are
# assigned to X further down, and assigning into a column selection of df
# makes pandas emit SettingWithCopyWarning.
y = df[df.columns[0]]
X = df[df.columns[1:]].copy()

In [4]:
# Display the feature matrix.
X

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.600741,0.673626,0.670432,0.615525,0.614977,0.999207,0.798059,0.809964,0.304196,0.782286,...,0.868946,0.007478,0.624790,0.615520,0.844167,0.276214,0.026791,0.565159,1,0.073384
1,0.480183,0.545028,0.531345,0.600902,0.600902,0.999035,0.797417,0.809330,0.303434,0.781602,...,0.799502,0.006413,0.622152,0.600904,0.840529,0.287872,0.028049,0.567897,1,0.017570
2,0.541413,0.615733,0.601585,0.605169,0.605169,0.999069,0.797515,0.809434,0.303536,0.781702,...,0.843483,0.000319,0.624424,0.605167,0.842898,0.276730,0.026791,0.565158,1,0.057494
3,0.462341,0.523877,0.512126,0.601962,0.602538,0.998969,0.797255,0.809172,0.303291,0.781481,...,0.791869,0.006227,0.623493,0.601958,0.839785,0.277092,0.026425,0.562485,1,0.050382
4,0.522547,0.583679,0.565555,0.603554,0.603554,0.999061,0.797494,0.809387,0.303515,0.781648,...,0.822404,0.001205,0.624300,0.603550,0.842597,0.281994,0.026928,0.565734,1,0.023469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,0.493199,0.534289,0.539590,0.610437,0.610437,0.998955,0.797399,0.809320,0.303570,0.781578,...,0.796847,0.001590,0.622744,0.610433,0.840095,0.276779,0.028218,0.568053,1,0.056379
3337,0.503924,0.546282,0.556614,0.602581,0.602581,0.998977,0.797407,0.809349,0.303537,0.781623,...,0.804379,0.005958,0.623137,0.602576,0.840829,0.281551,0.027089,0.566265,1,0.024324
3338,0.579681,0.632523,0.620322,0.644215,0.644215,0.999297,0.797949,0.809750,0.303816,0.782045,...,0.840079,0.000716,0.624140,0.644214,0.843738,0.280141,0.026801,0.565204,1,0.027962
3339,0.526057,0.601068,0.575673,0.609024,0.609089,0.999132,0.797810,0.809665,0.303919,0.781883,...,0.837170,0.012565,0.623716,0.609020,0.842199,0.275613,0.026802,0.565211,1,0.114083


In [5]:
# Peek at the first few target values.
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Bankrupt?, dtype: int64

## Feature engineering

**Creating new features**:
- ROA_(C/A/B)_Debt(Equity/Asset)_Interaction - an interaction between *Return on Assets* and leverage ratios could indicate how effectively a company uses borrowed funds to generate profits, a crucial factor in financial health
- Profitability_Liquidity_Ratio - *Operating Profit Rate*/*Current Ratio* could gauge how operating profits stand against current assets, highlighting potential cash flow issues
- Interests_coverage_adjustments - adjusting *Interest Coverage Ratio* by multiplying with *Working Capital to Total Assets* to gauge how working capital might cushion interest payments
- CashFlow_to_Debt_Ratio - a new feature dividing *Cash Flow to Total Assets* by *Total debt/Total net worth*, which might better reflect the ability to cover debts

In [6]:
# Source columns are addressed by their position in df (position 0 is the target).
roa_c, roa_a, roa_b = df.iloc[:, 1], df.iloc[:, 2], df.iloc[:, 3]
debt_equity, debt_asset = df.iloc[:, 36], df.iloc[:, 37]

# ROA x leverage interaction terms.
X['our_ROA_C_DebtEquity_Interaction'] = roa_c * debt_equity
X['our_ROA_C_DebtAsset_Interaction'] = roa_c * debt_asset
X['our_ROA_A_DebtEquity_Interaction'] = roa_a * debt_equity
X['our_ROA_A_DebtAsset_Interaction'] = roa_a * debt_asset
X['our_ROA_B_DebtEquity_Interaction'] = roa_b * debt_equity
X['our_ROA_B_DebtAsset_Interaction'] = roa_b * debt_asset

# Ratio features: rows whose denominator is zero keep the 0 pre-filled in `out`.
profit_numerator, profit_denominator = df.iloc[:, 6], df.iloc[:, 33]
X['our_Profitability_Liquidity_Ratio'] = np.divide(
    profit_numerator,
    profit_denominator,
    out=np.zeros_like(profit_numerator),
    where=profit_denominator != 0,
)

X['our_Interests_coverage_adjustments'] = df.iloc[:, 93] * df.iloc[:, 54]

cash_flow = df.iloc[:, 80]
X['our_CashFlow_to_Debt_Ratio'] = np.divide(
    cash_flow,
    debt_equity,
    out=np.zeros_like(cash_flow),
    where=debt_equity != 0,
)

In [7]:
# Standardise every column, rebuild a DataFrame under the original column
# names, then discard the two flag columns.
# NOTE(review): X is overwritten here, so re-running this cell on the already
# reduced X will fail when the flag columns are no longer present.
cols = X.columns
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=cols)
X = X_scaled.drop(columns=['Net Income Flag', 'Liability-Assets Flag'])

In [8]:
# Display the standardised feature matrix.
X

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Equity to Liability,our_ROA_C_DebtEquity_Interaction,our_ROA_C_DebtAsset_Interaction,our_ROA_A_DebtEquity_Interaction,our_ROA_A_DebtAsset_Interaction,our_ROA_B_DebtEquity_Interaction,our_ROA_B_DebtAsset_Interaction,our_Profitability_Liquidity_Ratio,our_Interests_coverage_adjustments,our_CashFlow_to_Debt_Ratio
0,1.590092,1.784798,1.921670,0.483378,0.449832,0.054893,0.080610,0.067291,0.124714,0.082836,...,0.480488,-0.031746,-0.994607,-0.031256,-0.973893,-0.031745,-0.972055,-0.033592,-0.583768,0.355199
1,-0.402372,-0.192739,-0.354871,-0.436031,-0.435872,0.029452,0.024815,0.023132,0.012209,0.022437,...,-0.575505,-0.031746,1.522828,-0.031256,1.618574,-0.031745,1.552685,0.029866,0.221399,-0.496295
2,0.609571,0.894529,0.794791,-0.167775,-0.167394,0.034410,0.033359,0.030380,0.027222,0.031278,...,0.179846,-0.031746,-0.803270,-0.031256,-0.759551,-0.031745,-0.784119,-0.605498,2.216809,0.210362
3,-0.697253,-0.517997,-0.669451,-0.369420,-0.332925,0.019569,0.010743,0.012074,-0.008936,0.011748,...,0.045298,-0.031746,-0.843734,-0.031256,-0.806358,-0.031745,-0.829482,0.527645,-1.092753,0.002053
4,0.297770,0.401612,0.205063,-0.269277,-0.268980,0.033213,0.031472,0.027105,0.024119,0.026528,...,-0.463898,-0.031746,0.966600,-0.031256,0.996780,-0.031745,0.921395,-0.136141,0.600015,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,-0.187253,-0.357883,-0.219926,0.163466,0.164121,0.017586,0.023228,0.022379,0.032250,0.020329,...,0.158742,-0.031746,-0.906813,-0.031256,-0.931278,-0.031745,-0.909775,3.662833,-1.288714,0.098892
3337,-0.010002,-0.173458,0.058727,-0.330450,-0.330204,0.020869,0.023913,0.024402,0.027436,0.024297,...,-0.447732,-0.031746,0.756950,-0.031256,0.696184,-0.031745,0.775841,1.107732,-1.509340,-0.394969
3338,1.242035,1.152724,1.101485,2.287304,2.289721,0.068109,0.071020,0.052378,0.068658,0.061545,...,-0.378902,-0.031746,0.808570,-0.031256,0.766257,-0.031745,0.731721,-0.525491,0.949578,-0.266732
3339,0.355780,0.669028,0.370678,0.074651,0.079315,0.043728,0.058967,0.046454,0.083741,0.047261,...,1.250505,-0.031746,-1.561961,-0.031256,-1.534367,-0.031745,-1.562038,-0.084611,-0.699112,1.127521


## Feature selection

**Select by single feature performance** - selects features based on the performance of a machine learning model trained utilising a single feature

In [9]:
# Score each feature on its own with a random forest (3-fold CV, ROC AUC) and
# keep the features the selector accepts.
# NOTE(review): threshold=None leaves the cut-off to the selector; per the
# feature_engine docs this is the mean single-feature score - confirm for the
# installed version.
single_feature_model = RandomForestClassifier(random_state=1)
sing_feat_perf = SelectBySingleFeaturePerformance(
    estimator=single_feature_model,
    scoring='roc_auc',
    cv=3,
    threshold=None,
)
X_sing_feat_perf = sing_feat_perf.fit(X, y).transform(X)
X_sing_feat_perf

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Equity to Liability,our_Profitability_Liquidity_Ratio,our_Interests_coverage_adjustments,our_CashFlow_to_Debt_Ratio
0,1.590092,1.784798,1.921670,0.483378,0.449832,0.054893,0.080610,0.067291,0.124714,0.082836,...,1.515307,0.483201,0.208196,-0.236128,-0.052763,0.003054,0.480488,-0.033592,-0.583768,0.355199
1,-0.402372,-0.192739,-0.354871,-0.436031,-0.435872,0.029452,0.024815,0.023132,0.012209,0.022437,...,-0.184277,-0.435818,0.017508,0.395327,0.006118,0.184126,-0.575505,0.029866,0.221399,-0.496295
2,0.609571,0.894529,0.794791,-0.167775,-0.167394,0.034410,0.033359,0.030380,0.027222,0.031278,...,0.892134,-0.167791,0.141656,-0.208214,-0.052765,0.003039,0.179846,-0.605498,2.216809,0.210362
3,-0.697253,-0.517997,-0.669451,-0.369420,-0.332925,0.019569,0.010743,0.012074,-0.008936,0.011748,...,-0.371086,-0.369522,-0.021506,-0.188564,-0.069910,-0.173740,0.045298,0.527645,-1.092753,0.002053
4,0.297770,0.401612,0.205063,-0.269277,-0.268980,0.033213,0.031472,0.027105,0.024119,0.026528,...,0.376237,-0.269450,0.125915,0.076939,-0.046382,0.041129,-0.463898,-0.136141,0.600015,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,-0.187253,-0.357883,-0.219926,0.163466,0.164121,0.017586,0.023228,0.022379,0.032250,0.020329,...,-0.249238,0.163332,-0.005266,-0.205532,0.014002,0.194443,0.158742,3.662833,-1.288714,0.098892
3337,-0.010002,-0.173458,0.058727,-0.330450,-0.330204,0.020869,0.023913,0.024402,0.027436,0.024297,...,-0.064895,-0.330675,0.033247,0.052923,-0.038840,0.076241,-0.447732,1.107732,-1.509340,-0.394969
3338,1.242035,1.152724,1.101485,2.287304,2.289721,0.068109,0.071020,0.052378,0.068658,0.061545,...,0.808821,2.287331,0.185705,-0.023422,-0.052311,0.006074,-0.378902,-0.525491,0.949578,-0.266732
3339,0.355780,0.669028,0.370678,0.074651,0.079315,0.043728,0.058967,0.046454,0.083741,0.047261,...,0.737607,0.074509,0.105025,-0.268666,-0.052248,0.006492,1.250505,-0.084611,-0.699112,1.127521


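**p-value** - keep the original features whose two-sample t-test p-value (between the two `Bankrupt?` classes) is at most 0.05; the engineered `our_*` features are always kept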

In [10]:
# Columns that go through the t-test: every column of X except the last nine.
cols_to_test = X.columns[:-9]

def t_test_for_feature(feature, df):
    """Run a two-sample t-test on `feature` between the two `Bankrupt?` classes.

    Parameters:
        feature: name of the column of `df` to test.
        df: DataFrame holding `feature` and a `Bankrupt?` column with 0/1 labels.

    Returns:
        Truthy when the p-value is at most 0.05, falsy otherwise.

    Side effects:
        Prints the t-statistic and the p-value.
    """
    labels = df['Bankrupt?']
    in_class_0 = labels == 0
    in_class_1 = labels == 1
    t_stat, p_val = stats.ttest_ind(df.loc[in_class_0, feature], df.loc[in_class_1, feature])
    print(f"T-statistic for {feature}: {t_stat}, P-value: {p_val}")
    significant = p_val <= 0.05
    return significant

# One boolean per column of X: the t-test verdict for each tested column,
# followed by nine unconditional True entries for the remaining trailing columns.
tested_flags = [t_test_for_feature(name, df) for name in cols_to_test]
smaller_p_vals_than_alpha = tested_flags + [True] * 9
X_t_test = X.loc[:, smaller_p_vals_than_alpha]
X_t_test

T-statistic for ROA(C) before interest and depreciation before interest: 16.16099397231704, P-value: 1.2674758285996096e-56
T-statistic for ROA(A) before interest and % after tax: 17.953189484055414, P-value: 7.104331472625995e-69
T-statistic for ROA(B) before interest and depreciation after tax: 16.96342117131794, P-value: 5.626835760253952e-62
T-statistic for Operating Gross Margin: 7.633853738719521, P-value: 2.9547193268398404e-14
T-statistic for Realized Sales Gross Margin: 7.593553986993562, P-value: 4.014979491051391e-14
T-statistic for Operating Profit Rate: 0.11874888516050038, P-value: 0.9054814490160445
T-statistic for Pre-tax net Interest Rate: 0.28962985063616997, P-value: 0.7721174106534199
T-statistic for After-tax net Interest Rate: 0.17051300690550106, P-value: 0.8646170370154678
T-statistic for Non-industry income and expenditure/revenue: 0.611799738478212, P-value: 0.5407119164945315
T-statistic for Continuous interest rate (after tax): 0.28353290122273933, P-value: 

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Research and development expense rate,Cash flow rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),...,Equity to Liability,our_ROA_C_DebtEquity_Interaction,our_ROA_C_DebtAsset_Interaction,our_ROA_A_DebtEquity_Interaction,our_ROA_A_DebtAsset_Interaction,our_ROA_B_DebtEquity_Interaction,our_ROA_B_DebtAsset_Interaction,our_Profitability_Liquidity_Ratio,our_Interests_coverage_adjustments,our_CashFlow_to_Debt_Ratio
0,1.590092,1.784798,1.921670,0.483378,0.449832,2.415194,0.461188,-0.808236,5.157655,5.134578,...,0.480488,-0.031746,-0.994607,-0.031256,-0.973893,-0.031745,-0.972055,-0.033592,-0.583768,0.355199
1,-0.402372,-0.192739,-0.354871,-0.436031,-0.435872,-0.749114,-0.347515,1.484898,0.102253,0.103740,...,-0.575505,-0.031746,1.522828,-0.031256,1.618574,-0.031745,1.552685,0.029866,0.221399,-0.496295
2,0.609571,0.894529,0.794791,-0.167775,-0.167394,1.254051,0.583860,-0.808236,-0.018989,-0.016914,...,0.179846,-0.031746,-0.803270,-0.031256,-0.759551,-0.031745,-0.784119,-0.605498,2.216809,0.210362
3,-0.697253,-0.517997,-0.669451,-0.369420,-0.332925,-0.598780,0.140507,-0.808236,-0.366369,-0.362606,...,0.045298,-0.031746,-0.843734,-0.031256,-0.806358,-0.031745,-0.829482,0.527645,-1.092753,0.002053
4,0.297770,0.401612,0.205063,-0.269277,-0.268980,-0.356939,-0.267797,0.973470,-0.212432,-0.209417,...,-0.463898,-0.031746,0.966600,-0.031256,0.996780,-0.031745,0.921395,-0.136141,0.600015,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,-0.187253,-0.357883,-0.219926,0.163466,0.164121,-0.749114,0.166984,1.132164,-0.607492,-0.602557,...,0.158742,-0.031746,-0.906813,-0.031256,-0.931278,-0.031745,-0.909775,3.662833,-1.288714,0.098892
3337,-0.010002,-0.173458,0.058727,-0.330450,-0.330204,0.027545,-0.196651,-0.808236,0.532732,0.532127,...,-0.447732,-0.031746,0.756950,-0.031256,0.696184,-0.031745,0.775841,1.107732,-1.509340,-0.394969
3338,1.242035,1.152724,1.101485,2.287304,2.289721,-0.749114,2.924725,0.431745,0.196250,0.197280,...,-0.378902,-0.031746,0.808570,-0.031256,0.766257,-0.031745,0.731721,-0.525491,0.949578,-0.266732
3339,0.355780,0.669028,0.370678,0.074651,0.079315,-0.345405,0.695293,0.018418,0.273900,0.274552,...,1.250505,-0.031746,-1.561961,-0.031256,-1.534367,-0.031745,-1.562038,-0.084611,-0.699112,1.127521


**Recursive feature elimination** - Given an external estimator that assigns weights to features, the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features

In [11]:
# Recursive feature elimination driven by a logistic-regression model.
# NOTE(review): n_features_to_select=None leaves the count to scikit-learn
# (documented as half of the features) - confirm for the installed version.
model = LogisticRegression(max_iter=1000)
rfe = feature_selection.RFE(estimator=model, n_features_to_select=None).fit(X, y)
rfe_feature_names = rfe.get_feature_names_out()
X_rfe = pd.DataFrame(rfe.transform(X), columns=rfe_feature_names)
X_rfe

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,Cash flow rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (C),...,Cash Flow to Equity,Current Liability to Current Assets,Net Income to Total Assets,Net Income to Stockholder's Equity,Liability to Equity,our_ROA_C_DebtAsset_Interaction,our_ROA_A_DebtAsset_Interaction,our_ROA_B_DebtAsset_Interaction,our_Profitability_Liquidity_Ratio,our_CashFlow_to_Debt_Ratio
0,1.590092,1.784798,1.921670,0.449832,0.054893,0.080610,0.461188,-0.808236,5.157655,5.132081,...,-0.203516,-0.019504,1.515307,0.208196,-0.236128,-0.994607,-0.973893,-0.972055,-0.033592,0.355199
1,-0.402372,-0.192739,-0.354871,-0.435872,0.029452,0.024815,-0.347515,1.484898,0.102253,0.102354,...,-0.265501,0.049012,-0.184277,0.017508,0.395327,1.522828,1.618574,1.552685,0.029866,-0.496295
2,0.609571,0.894529,0.794791,-0.167394,0.034410,0.033359,0.583860,-0.808236,-0.018989,-0.018272,...,0.825635,-0.648957,0.892134,0.141656,-0.208214,-0.803270,-0.759551,-0.784119,-0.605498,0.210362
3,-0.697253,-0.517997,-0.669451,-0.332925,0.019569,0.010743,0.140507,-0.808236,-0.366369,-0.317806,...,-0.096366,0.574706,-0.371086,-0.021506,-0.188564,-0.843734,-0.806358,-0.829482,0.527645,0.002053
4,0.297770,0.401612,0.205063,-0.268980,0.033213,0.031472,-0.267797,0.973470,-0.212432,-0.210733,...,0.275060,-0.130416,0.376237,0.125915,0.076939,0.966600,0.996780,0.921395,-0.136141,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,-0.187253,-0.357883,-0.219926,0.164121,0.017586,0.023228,0.166984,1.132164,-0.607492,-0.603786,...,-0.074063,3.515160,-0.249238,-0.005266,-0.205532,-0.906813,-0.931278,-0.909775,3.662833,0.098892
3337,-0.010002,-0.173458,0.058727,-0.330204,0.020869,0.023913,-0.196651,-0.808236,0.532732,0.530647,...,-0.072757,1.165325,-0.064895,0.033247,0.052923,0.756950,0.696184,0.775841,1.107732,-0.394969
3338,1.242035,1.152724,1.101485,2.289721,0.068109,0.071020,2.924725,0.431745,0.196250,0.195874,...,2.644652,-0.559454,0.808821,0.185705,-0.023422,0.808570,0.766257,0.731721,-0.525491,-0.266732
3339,0.355780,0.669028,0.370678,0.079315,0.043728,0.058967,0.695293,0.018418,0.273900,0.273129,...,0.101145,-0.074578,0.737607,0.105025,-0.268666,-1.561961,-1.534367,-1.562038,-0.084611,1.127521


**Best features from above selections**

In [12]:
# Keep only the columns retained by all three selections above.
common_columns = X_rfe.columns
for selection in (X_t_test, X_sing_feat_perf):
    common_columns = common_columns.intersection(selection.columns)
X_selected = X[common_columns]
X_selected

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Realized Sales Gross Margin,Tax rate (A),Net Value Per Share (B),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Per Share Net profit before tax (Yuan ¥),After-tax Net Profit Growth Rate,...,Net profit before tax/Paid-in capital,Working Capital to Total Assets,Working Capital/Equity,Cash Flow to Liability,Current Liability to Current Assets,Net Income to Total Assets,Net Income to Stockholder's Equity,Liability to Equity,our_Profitability_Liquidity_Ratio,our_CashFlow_to_Debt_Ratio
0,1.590092,1.784798,1.921670,0.449832,-0.808236,5.157655,5.132081,4.025152,3.012140,0.025353,...,3.216051,-0.621425,-0.155593,-0.257451,-0.019504,1.515307,0.208196,-0.236128,-0.033592,0.355199
1,-0.402372,-0.192739,-0.354871,-0.435872,1.484898,0.102253,0.102354,-0.198406,-0.245933,0.016495,...,-0.240324,0.166190,0.322613,-0.094326,0.049012,-0.184277,0.017508,0.395327,0.029866,-0.496295
2,0.609571,0.894529,0.794791,-0.167394,-0.808236,-0.018989,-0.018272,0.437619,0.268624,0.004303,...,0.303525,2.355897,0.441036,0.840001,-0.648957,0.892134,0.141656,-0.208214,-0.605498,0.210362
3,-0.697253,-0.517997,-0.669451,-0.332925,-0.808236,-0.366369,-0.317806,-0.465127,-0.506769,-0.034019,...,-0.483685,-1.101781,-0.245907,-0.093471,0.574706,-0.371086,-0.021506,-0.188564,0.527645,0.002053
4,0.297770,0.401612,0.205063,-0.268980,0.973470,-0.212432,-0.210733,0.250035,0.320791,0.019022,...,0.360905,0.622224,0.301289,0.033727,-0.130416,0.376237,0.125915,0.076939,-0.136141,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,-0.187253,-0.357883,-0.219926,0.164121,1.132164,-0.607492,-0.603786,-0.415300,-0.416662,-0.002504,...,-0.423788,-1.434919,-0.312236,-0.076022,3.515160,-0.249238,-0.005266,-0.205532,3.662833,0.098892
3337,-0.010002,-0.173458,0.058727,-0.330204,-0.808236,0.532732,0.530647,-0.034271,-0.283873,0.009211,...,-0.281849,-1.629524,-0.412056,-0.065737,1.165325,-0.064895,0.033247,0.052923,1.107732,-0.394969
3338,1.242035,1.152724,1.101485,2.289721,0.431745,0.196250,0.195874,0.812786,0.778439,0.095104,...,0.785465,1.007479,0.335941,0.971540,-0.559454,0.808821,0.185705,-0.023422,-0.525491,-0.266732
3339,0.355780,0.669028,0.370678,0.079315,0.018418,0.273900,0.273129,0.188484,0.249654,0.010534,...,0.285657,-0.745263,-0.183230,0.298302,-0.074578,0.737607,0.105025,-0.268666,-0.084611,1.127521


**Sequential feature selection** - at each stage, this estimator chooses the best feature to add (forward selection) based on the cross-validation score of an estimator.

In [15]:
# Sequential feature selection over the shared columns, scored with a
# 3-nearest-neighbours classifier (all other selector settings are defaults).
knn = KNeighborsClassifier(n_neighbors=3)
sfs = feature_selection.SequentialFeatureSelector(knn).fit(X_selected, y)
sfs_feature_names = sfs.get_feature_names_out()
X_new = pd.DataFrame(sfs.transform(X_selected), columns=sfs_feature_names)
X_new

Unnamed: 0,Tax rate (A),Net Value Per Share (B),Net Value Per Share (C),After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Total Asset Growth Rate,Current Ratio,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Cash Flow to Liability,Net Income to Stockholder's Equity,Liability to Equity,our_CashFlow_to_Debt_Ratio
0,-0.808236,5.157655,5.132081,0.025353,0.026415,1.307121,-0.282321,1.337375,3.216051,-0.257451,0.208196,-0.236128,0.355199
1,1.484898,0.102253,0.102354,0.016495,0.017581,-1.649642,-0.313375,0.069024,-0.240324,-0.094326,0.017508,0.395327,-0.496295
2,-0.808236,-0.018989,-0.018272,0.004303,0.005423,0.623277,0.524030,0.334126,0.303525,0.840001,0.141656,-0.208214,0.210362
3,-0.808236,-0.366369,-0.317806,-0.034019,-0.032790,-1.884152,-0.463559,-0.471518,-0.483685,-0.093471,-0.021506,-0.188564,0.002053
4,0.973470,-0.212432,-0.210733,0.019022,0.020060,0.581832,-0.222025,0.485489,0.360905,0.033727,0.125915,0.076939,-0.401602
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,1.132164,-0.607492,-0.603786,-0.002504,-0.001364,0.177742,-0.663136,-0.480996,-0.423788,-0.076022,-0.005266,-0.205532,0.098892
3337,-0.808236,0.532732,0.530647,0.009211,0.008631,0.792511,-0.546518,-0.341696,-0.281849,-0.065737,0.033247,0.052923,-0.394969
3338,0.431745,0.196250,0.195874,0.095104,0.095968,-1.884152,0.278174,0.511626,0.785465,0.971540,0.185705,-0.023422,-0.266732
3339,0.018418,0.273900,0.273129,0.010534,0.011637,0.291716,-0.254128,-0.082052,0.285657,0.298302,0.105025,-0.268666,1.127521


In [16]:
# Put the target back in front of each selected feature set and write both out.
# NOTE(review): df_new is built from X_new, which shows 13 feature columns in
# the output above, yet it is written to the '*_27.csv' file, while the larger
# df_selected goes to '*_13.csv'. The suffixes look swapped - confirm against
# whatever reads these files before renaming.
df_new = pd.concat([y, X_new], axis=1)
df_selected = pd.concat([y, X_selected], axis=1)
for frame, path in ((df_new, 'train_after_fe_27.csv'), (df_selected, 'train_after_fe_13.csv')):
    frame.to_csv(path)

**Prepare csv**

In [17]:
# test
# Applies the training-time pipeline to the split stored in 'dev_valid.csv'
# and writes the result to the 'test_after_fe_*.csv' files.


def _read_split(path, n_leading_cols):
    """Read a split CSV, drop its first `n_leading_cols` columns and strip header whitespace."""
    frame = pd.read_csv(path)
    frame = frame[frame.columns[n_leading_cols:]]
    frame.columns = [i.strip() for i in frame.columns]
    return frame


def _engineered_features(frame):
    """Return the feature columns of `frame` plus the nine engineered `our_*` columns.

    `frame` must hold the target in column 0. Source columns are addressed by
    position, exactly as in the training feature-engineering cell.
    """
    feats = frame[frame.columns[1:]].copy()  # explicit copy: columns are added below
    feats['our_ROA_C_DebtEquity_Interaction'] = frame[frame.columns[1]] * frame[frame.columns[36]]
    feats['our_ROA_C_DebtAsset_Interaction'] = frame[frame.columns[1]] * frame[frame.columns[37]]
    feats['our_ROA_A_DebtEquity_Interaction'] = frame[frame.columns[2]] * frame[frame.columns[36]]
    feats['our_ROA_A_DebtAsset_Interaction'] = frame[frame.columns[2]] * frame[frame.columns[37]]
    feats['our_ROA_B_DebtEquity_Interaction'] = frame[frame.columns[3]] * frame[frame.columns[36]]
    feats['our_ROA_B_DebtAsset_Interaction'] = frame[frame.columns[3]] * frame[frame.columns[37]]

    # Rows with a zero denominator keep the 0 pre-filled in `out`.
    feats['our_Profitability_Liquidity_Ratio'] = np.divide(frame[frame.columns[6]], frame[frame.columns[33]], out=np.zeros_like(frame[frame.columns[6]]), where=frame[frame.columns[33]] != 0)

    feats['our_Interests_coverage_adjustments'] = frame[frame.columns[93]] * frame[frame.columns[54]]

    feats['our_CashFlow_to_Debt_Ratio'] = np.divide(frame[frame.columns[80]], frame[frame.columns[36]], out=np.zeros_like(frame[frame.columns[80]]), where=frame[frame.columns[36]] != 0)
    return feats


# The standardisation statistics must come from the training split only.
# Re-fitting a StandardScaler on this split would scale it with different
# means/variances than the training CSV, so the same raw value would map to
# different model inputs. The scaler is therefore fitted on the training file
# and only applied here.
train_scaler = StandardScaler().fit(_engineered_features(_read_split('dev_train.csv', 2)))

df = _read_split('dev_valid.csv', 2)
y = df[df.columns[0]]
X = _engineered_features(df)

cols = X.columns
X = pd.DataFrame(train_scaler.transform(X), columns=cols)
X = X.drop(columns=['Net Income Flag', 'Liability-Assets Flag'])

# Re-apply the selectors fitted on the training data (nothing is re-fitted here).
X_sing_feat_perf = sing_feat_perf.transform(X)
X_t_test = X.loc[:, smaller_p_vals_than_alpha]
# Transform the current X rather than reusing the X_rfe frame left over from training.
X_rfe = pd.DataFrame(rfe.transform(X), columns=rfe.get_feature_names_out())
common_columns = X_rfe.columns.intersection(X_t_test.columns).intersection(X_sing_feat_perf.columns)
X_selected = X[common_columns]
X_new = pd.DataFrame(sfs.transform(X_selected), columns=sfs.get_feature_names_out())

df_new = pd.concat([y, X_new], axis=1)
df_selected = pd.concat([y, X_selected], axis=1)

# NOTE(review): df_new holds the smaller (sequentially selected) feature set but
# is written to the '*_27.csv' file; the suffixes look swapped - confirm with
# the consumers of these files before renaming.
df_new.to_csv('test_after_fe_27.csv')
df_selected.to_csv('test_after_fe_13.csv')

In [18]:
# valid
# Applies the training-time pipeline to the split stored in 'valid_test.csv'
# and writes the result to the 'valid_after_fe_*.csv' files.


def _read_split(path, n_leading_cols):
    """Read a split CSV, drop its first `n_leading_cols` columns and strip header whitespace."""
    frame = pd.read_csv(path)
    frame = frame[frame.columns[n_leading_cols:]]
    frame.columns = [i.strip() for i in frame.columns]
    return frame


def _engineered_features(frame):
    """Return the feature columns of `frame` plus the nine engineered `our_*` columns.

    `frame` must hold the target in column 0. Source columns are addressed by
    position, exactly as in the training feature-engineering cell.
    """
    feats = frame[frame.columns[1:]].copy()  # explicit copy: columns are added below
    feats['our_ROA_C_DebtEquity_Interaction'] = frame[frame.columns[1]] * frame[frame.columns[36]]
    feats['our_ROA_C_DebtAsset_Interaction'] = frame[frame.columns[1]] * frame[frame.columns[37]]
    feats['our_ROA_A_DebtEquity_Interaction'] = frame[frame.columns[2]] * frame[frame.columns[36]]
    feats['our_ROA_A_DebtAsset_Interaction'] = frame[frame.columns[2]] * frame[frame.columns[37]]
    feats['our_ROA_B_DebtEquity_Interaction'] = frame[frame.columns[3]] * frame[frame.columns[36]]
    feats['our_ROA_B_DebtAsset_Interaction'] = frame[frame.columns[3]] * frame[frame.columns[37]]

    # Rows with a zero denominator keep the 0 pre-filled in `out`.
    feats['our_Profitability_Liquidity_Ratio'] = np.divide(frame[frame.columns[6]], frame[frame.columns[33]], out=np.zeros_like(frame[frame.columns[6]]), where=frame[frame.columns[33]] != 0)

    feats['our_Interests_coverage_adjustments'] = frame[frame.columns[93]] * frame[frame.columns[54]]

    feats['our_CashFlow_to_Debt_Ratio'] = np.divide(frame[frame.columns[80]], frame[frame.columns[36]], out=np.zeros_like(frame[frame.columns[80]]), where=frame[frame.columns[36]] != 0)
    return feats


# The standardisation statistics must come from the training split only.
# Re-fitting a StandardScaler on this split would scale it with different
# means/variances than the training CSV, so the same raw value would map to
# different model inputs. The scaler is therefore fitted on the training file
# and only applied here.
train_scaler = StandardScaler().fit(_engineered_features(_read_split('dev_train.csv', 2)))

# This file carries one leading column to drop (the training file carries two).
df = _read_split('valid_test.csv', 1)
y = df[df.columns[0]]
X = _engineered_features(df)

cols = X.columns
X = pd.DataFrame(train_scaler.transform(X), columns=cols)
X = X.drop(columns=['Net Income Flag', 'Liability-Assets Flag'])

# Re-apply the selectors fitted on the training data (nothing is re-fitted here).
X_sing_feat_perf = sing_feat_perf.transform(X)
X_t_test = X.loc[:, smaller_p_vals_than_alpha]
# Transform the current X rather than reusing an X_rfe frame left over from an earlier cell.
X_rfe = pd.DataFrame(rfe.transform(X), columns=rfe.get_feature_names_out())
common_columns = X_rfe.columns.intersection(X_t_test.columns).intersection(X_sing_feat_perf.columns)
X_selected = X[common_columns]
X_new = pd.DataFrame(sfs.transform(X_selected), columns=sfs.get_feature_names_out())

df_new = pd.concat([y, X_new], axis=1)
df_selected = pd.concat([y, X_selected], axis=1)

# NOTE(review): df_new holds the smaller (sequentially selected) feature set but
# is written to the '*_27.csv' file; the suffixes look swapped - confirm with
# the consumers of these files before renaming.
df_new.to_csv('valid_after_fe_27.csv')
df_selected.to_csv('valid_after_fe_13.csv')