In [2]:
from imbd.data import DataLoader, DataPreprocessor
from imbd.training import ModelTrainer
import numpy as np
import pandas as pd

In [3]:
loader = DataLoader()

In [4]:
df = loader.build(data_type='label_20')

In [5]:
preprocessor = DataPreprocessor()

In [6]:
train_features = df.drop(loader.labels, axis=1)
train_labels = df[loader.labels]

In [7]:
train_features = preprocessor(train_features)

[Pipeline] ... (step 1 of 5) Processing features_select, total=   0.2s
[Pipeline] ...... (step 2 of 5) Processing quantization, total=   0.2s
[Pipeline] ........... (step 3 of 5) Processing fill_na, total=   0.1s
[Pipeline] . (step 4 of 5) Processing variance_selector, total=   0.0s
[Pipeline] . (step 5 of 5) Processing outlier_detection, total=   0.1s


In [8]:
param_grid = {
"estimator__n_estimators": [1000],
"estimator__max_depth": [2, 5, 10],
# "estimator__alpha": [0, 0.1],
# "estimator__lambda": [1, 0.5],
"estimator__subsample": [1, 0.5],
# "estimator__gamma": [0, 2],
}

trainer = ModelTrainer(param_grid=param_grid)
trainer.train(train_features, train_labels)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1, total=   8.4s
[CV] estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s remaining:    0.0s
[CV]  estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1, total=   9.6s
[CV] estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1 
[CV]  estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=1, total=   8.0s
[CV] estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=0.5 
[CV]  estimator__max_depth=2, estimator__n_estimators=1000, estimator__subsample=0.5, total=  10.3s
[CV] estimator__max_depth=2, estimator__n_estimator

GridSearchCV(cv=3,
             estimator=MultiOutputRegressor(estimator=XGBRegressor(base_score=None,
                                                                   booster=None,
                                                                   colsample_bylevel=None,
                                                                   colsample_bynode=None,
                                                                   colsample_bytree=None,
                                                                   gamma=None,
                                                                   gpu_id=None,
                                                                   importance_type='gain',
                                                                   interaction_constraints=None,
                                                                   learning_rate=None,
                                                                   max_delta_step=None,
                       

In [9]:
trainer.training_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__max_depth,param_estimator__n_estimators,param_estimator__subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,8.633772,0.649166,0.03378,0.002525,2,1000,1.0,"{'estimator__max_depth': 2, 'estimator__n_esti...",0.151139,0.185689,0.109918,0.148915,0.030973,2
1,9.024456,0.882759,0.039885,0.003083,2,1000,0.5,"{'estimator__max_depth': 2, 'estimator__n_esti...",0.164471,0.182591,0.115417,0.15416,0.028376,1
2,8.148944,1.148155,0.031957,0.00283,5,1000,1.0,"{'estimator__max_depth': 5, 'estimator__n_esti...",0.149009,0.188953,0.097245,0.145069,0.037543,4
3,7.632787,0.106976,0.033077,0.00094,5,1000,0.5,"{'estimator__max_depth': 5, 'estimator__n_esti...",0.145274,0.169932,0.101922,0.139043,0.028112,5
4,9.495433,1.783757,0.033476,0.00164,10,1000,1.0,"{'estimator__max_depth': 10, 'estimator__n_est...",0.15271,0.198517,0.094049,0.148425,0.042756,3
5,7.779977,1.446588,0.031412,0.000642,10,1000,0.5,"{'estimator__max_depth': 10, 'estimator__n_est...",0.148859,0.166153,0.100756,0.13859,0.027668,6


In [10]:
pred = trainer.predict(train_features)
pred = np.round(pred, 2)

In [11]:
res = np.abs(pred - train_labels)
res['mean_abs_diff'] = np.mean(res, axis=1)
res['outlier'] = train_features['outlier']


In [12]:
res.sort_values('mean_abs_diff', ascending=False).to_csv('output.csv', index=False)

In [14]:
import plotly.express as px

In [15]:
# columns = train_features.filter(regex='Input_C_[0-9]+').columns
columns = train_features.filter(regex='(Input_A*|Output_A[0-9]+)').columns
target = pd.concat([train_features[columns], train_features['outlier'], res['mean_abs_diff']], axis=1)
# target = pd.concat([train_features[columns[:20]], res['mean_abs_diff']], axis=1)
target.columns

Index(['Input_A2_020', 'Input_A3_001', 'Input_A3_020', 'Input_A4_020',
       'Input_A5_005', 'Input_A5_020', 'Input_A6_020', 'Input_C_135',
       'Input_C_136', 'Input_C_137',
       ...
       'Input_C_079_x', 'Input_C_079_y', 'Input_C_080_x', 'Input_C_080_y',
       'Input_C_081_x', 'Input_C_081_y', 'Input_C_082_x', 'Input_C_082_y',
       'outlier', 'mean_abs_diff'],
      dtype='object', length=106)

In [16]:
px.parallel_coordinates(target, color='mean_abs_diff', color_continuous_scale=px.colors.diverging.Tealrose)

In [17]:
pred_df = pd.DataFrame(pred, columns=train_labels.columns)

In [18]:
test = pd.concat([pred_df['Input_A1_020'], train_labels['Input_A1_020']], axis=1)

In [19]:
res.sort_values('mean_abs_diff', ascending=False)

Unnamed: 0,Input_A6_024,Input_A3_016,Input_C_013,Input_A2_016,Input_A3_017,Input_C_050,Input_A6_001,Input_C_096,Input_A3_018,Input_A6_019,...,Input_A3_015,Input_C_046,Input_C_049,Input_A2_024,Input_C_058,Input_C_057,Input_A3_013,Input_A2_017,mean_abs_diff,outlier
5,2.235174e-10,2.235174e-10,0.005,6.705523e-10,2.235174e-10,0.0020,0.0,0.000000e+00,4.470348e-10,4.470348e-10,...,4.000000e-03,0.0012,0.0004,1.000000e-03,0.0050,0.0050,0.004,6.705523e-10,0.001530,1
162,0.000000e+00,7.450581e-10,0.004,8.940697e-10,7.450581e-10,0.0050,0.0,2.235174e-10,8.940697e-10,8.940697e-10,...,6.705523e-10,0.0011,0.0009,2.235174e-10,0.0040,0.0040,0.006,8.940697e-10,0.001400,1
257,4.470348e-10,6.705523e-10,0.005,4.470348e-10,6.705523e-10,0.0050,0.0,4.470348e-10,6.705523e-10,4.470348e-10,...,3.000001e-03,0.0004,0.0006,4.470348e-10,0.0030,0.0030,0.004,4.470348e-10,0.001400,1
1,2.235174e-10,4.470348e-10,0.004,6.705523e-10,2.235174e-10,0.0050,0.0,0.000000e+00,6.705523e-10,4.470348e-10,...,6.705523e-10,0.0009,0.0006,2.235174e-10,0.0040,0.0050,0.004,4.470348e-10,0.001375,1
49,4.470348e-10,2.235174e-10,0.005,2.235174e-10,2.235174e-10,0.0050,0.0,2.235174e-10,2.235174e-10,2.235174e-10,...,7.450581e-10,0.0008,0.0007,2.235174e-10,0.0050,0.0030,0.004,2.235174e-10,0.001375,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,2.235174e-10,2.235174e-10,0.005,4.470348e-10,2.235174e-10,0.0009,0.0,2.235174e-10,2.235174e-10,4.470348e-10,...,4.470348e-10,0.0004,0.0004,2.235174e-10,0.0006,0.0004,0.004,2.235174e-10,0.000585,1
302,2.235174e-10,4.470348e-10,0.004,4.470348e-10,2.235174e-10,0.0010,0.0,2.235174e-10,2.235174e-10,4.470348e-10,...,2.235174e-10,0.0003,0.0007,2.235174e-10,0.0008,0.0003,0.002,4.470348e-10,0.000555,1
297,0.000000e+00,6.705523e-10,0.005,4.470348e-10,4.470348e-10,0.0012,0.0,2.235174e-10,6.705523e-10,6.705523e-10,...,6.705523e-10,0.0008,0.0003,2.235174e-10,0.0006,0.0008,0.002,2.235174e-10,0.000535,1
300,2.235174e-10,4.470348e-10,0.004,4.470348e-10,6.705523e-10,0.0010,0.0,2.235174e-10,4.470348e-10,4.470348e-10,...,4.470348e-10,0.0003,0.0002,2.235174e-10,0.0006,0.0003,0.002,4.470348e-10,0.000520,1


In [20]:
len(train_features.columns)

105

In [21]:
fi = pd.Series(trainer.model.best_estimator_.estimators_[0].feature_importances_, index=train_features.columns).sort_values(ascending=False)

In [22]:
fi.to_csv('feature_importances.csv', header=None)

In [28]:
with open('fi.md', 'w') as file:
    file.write(fi.to_markdown())

In [32]:
with open('train_result.md', 'w') as file:
    file.write(trainer.training_result.drop('params', axis=1).sort_values('rank_test_score').to_markdown())