In [1]:
from imbd.data import DataLoader, DataPreprocessor
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_validate

In [2]:
# Instantiate the project-local loader (imbd.data.DataLoader) with no arguments.
# Later cells read both the dataset and the label column names from it.
loader = DataLoader()

In [3]:
# Build the working DataFrame. It carries the feature columns together with the
# label columns listed in `loader.labels`; the two are separated in a later cell.
# NOTE(review): "label_20" presumably refers to a 20-label target set -- confirm
# against imbd.data.DataLoader.
df = loader.build_label_20_df()

In [4]:
# Instantiate the project-local preprocessor with no arguments. The object is
# callable: a later cell applies it directly to the feature frame.
preprocessor = DataPreprocessor()

In [5]:
# Separate the model inputs from the regression targets: every column named in
# `loader.labels` is a target, everything else is a feature.
train_labels = df[loader.labels]
train_features = df.drop(columns=loader.labels)

In [6]:
# Run the raw features through the preprocessor. The captured log below shows a
# three-step pipeline: features_select -> quantization -> fill_na.
# NOTE: this rebinds `train_features`, so the cell is not idempotent --
# re-running it without re-running the split cell feeds already-processed
# features back into the pipeline.
train_features = preprocessor(train_features)

[Pipeline] ... (step 1 of 3) Processing features_select, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing quantization, total=   0.1s
[Pipeline] ........... (step 3 of 3) Processing fill_na, total=   0.2s


In [7]:
# One independent XGBoost regressor (default hyper-parameters) is fitted per
# label column by the multi-output wrapper.
model = MultiOutputRegressor(estimator=XGBRegressor())

# RMSE scorer. make_scorer keeps its default greater_is_better=True, so the
# reported scores are plain positive RMSE values: LOWER means a better fit.
# NOTE(review): the `squared` argument of mean_squared_error is deprecated in
# recent scikit-learn releases -- confirm the pinned version before upgrading.
scorer = make_scorer(score_func=mean_squared_error, squared=False)

In [8]:
# 10-fold cross-validation scored with the RMSE scorer defined above.
# return_estimator=True keeps the fitted model of every fold under
# res['estimator'] so that one of them can be reused afterwards.
res = cross_validate(
    model,
    train_features,
    train_labels,
    scoring=scorer,
    cv=10,
    return_estimator=True,
)

In [9]:
# Per-fold validation scores, one entry per CV split. With the scorer used in
# this notebook these are positive RMSE values, so smaller is better.
res['test_score']

array([0.09345449, 0.15541898, 0.14906102, 0.18398673, 0.2227047 ,
       0.10122411, 0.18028061, 0.07327433, 0.08376163, 0.08404379])

In [10]:
# The scorer wraps RMSE with make_scorer's default greater_is_better=True, so
# res['test_score'] holds plain positive RMSE values where LOWER is better.
# Select the fold model with the smallest validation error; using argmax here
# would pick the fold with the largest error, i.e. the worst estimator.
best_estimator = res['estimator'][np.argmin(res['test_score'])]

In [11]:
# Predict every label for the full training feature set with the selected fold
# model. NOTE: this is the same data that was passed to cross_validate, so most
# of these rows were part of that model's own training split; the resulting
# errors are for inspection and are not an unbiased generalisation estimate.
pred = best_estimator.predict(train_features)

In [12]:
# Absolute prediction error per sample and per label, plus the row-wise mean
# across all labels in an extra 'mean_abs_diff' column.
# NOTE: this rebinds `res`, replacing the cross-validation result dict; the
# cells above that read res['test_score'] / res['estimator'] cannot be re-run
# after this one without re-running cross_validate first.
res = (train_labels - pred).abs()
res['mean_abs_diff'] = res.mean(axis=1)

In [13]:
# Write the error table to disk, worst-predicted samples first.
# NOTE(review): index=False drops the row index after sorting, so the CSV rows
# can no longer be matched back to the original samples -- confirm this is
# intended.
(
    res
    .sort_values(by='mean_abs_diff', ascending=False)
    .to_csv('output.csv', index=False)
)

In [14]:
import plotly.express as px

In [15]:
# Put the feature columns at positions 24..47 side by side with the per-sample
# mean absolute error, so the error can be used as the colour dimension of a plot.
# The captured output below shows this slice is Input_A2_004 .. Input_A3_008.
feature_slice = train_features.iloc[:, 24:48]
target = pd.concat([feature_slice, res['mean_abs_diff']], axis=1)
target.columns

Index(['Input_A2_004', 'Input_A2_005', 'Input_A2_006', 'Input_A2_007',
       'Input_A2_009', 'Input_A2_011', 'Input_A2_012', 'Input_A2_013',
       'Input_A2_014', 'Input_A2_015', 'Input_A2_018', 'Input_A2_019',
       'Input_A2_020', 'Input_A2_021', 'Input_A2_022', 'Input_A2_023',
       'Input_A3_001', 'Input_A3_002', 'Input_A3_003', 'Input_A3_004',
       'Input_A3_005', 'Input_A3_006', 'Input_A3_007', 'Input_A3_008',
       'mean_abs_diff'],
      dtype='object')

In [16]:
# Parallel-coordinates view of the selected features; each line is one sample,
# coloured by its mean absolute prediction error on a diverging Tealrose scale.
px.parallel_coordinates(
    data_frame=target,
    color='mean_abs_diff',
    color_continuous_scale=px.colors.diverging.Tealrose,
)