In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from keras.optimizers import Adam

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_absolute_error

import numpy as np

In [2]:
# Run configuration shared by the cells below.
params = dict(
    test_size=0.2,          # fraction of rows held out by train_test_split
    random_state=1337,      # seed reused wherever a random_state is accepted
    validation_split=0.2,   # not referenced in the visible cells; presumably for a later model fit (TODO confirm)
)

In [3]:
# Work on a fixed-size random subsample of the scan. The draw is seeded with
# the same seed as the train/test split so that a fresh "Restart & Run All"
# reproduces the same rows, and therefore the same metrics further down.
df = pd.read_parquet("../data/qsc_out.random_scan_nfp2.parquet").sample(
    n=20000, random_state=params['random_state']
)

In [4]:
# Peek at the first rows; the x* columns are used as inputs and the y* columns
# as targets by the column-selection cell below.
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,y0,y1,y2,y3,y4,y5,y6
927814,-0.042842,0.074952,0.000228,-0.008788,0.003354,0.000826,-0.107395,0.010778,265.493225,3.413386,0.03465,16.252445,0.877191,2.68266,0.312259
23124,-0.025451,0.014247,-0.000548,0.000191,0.003645,0.003318,-0.568982,0.089715,6.476036,1.286592,0.352464,0.525667,0.810975,2.62115,0.306909
235954,0.170515,-0.055282,0.017492,-0.00225,0.000192,-0.000542,-1.013491,0.026091,2.253073,21.323753,4.451335,0.437557,0.912871,4.965256,0.354264
290526,-0.116894,0.127822,-0.008796,0.01035,0.015036,0.00016,-0.217962,1.757503,7.21599,11.345546,3.885552,6.981246,2.735054,8.526979,0.337327
745104,-0.010574,-0.045799,0.00013,-0.001762,0.000166,-0.001131,-2.870485,-4.229006,142.002274,1.214765,0.084997,1.166556,0.693579,1.546168,0.303115


In [5]:
# (rows, columns) of the sampled frame.
df.shape

(20000, 15)

In [6]:
# Split the frame by column-name prefix: 'x…' columns are the model inputs,
# 'y…' columns are the regression targets.
x_columns = list(filter(lambda name: name.startswith('x'), df.columns))
y_columns = list(filter(lambda name: name.startswith('y'), df.columns))

# Plain NumPy arrays, with rows in the same order as df.
X = df[x_columns].to_numpy()
Y = df[y_columns].to_numpy()

In [7]:
def preprocess_data(X_train, X_test, Y_train, Y_test, params):
    """Standardise inputs and targets with statistics from the training split.

    One StandardScaler is fitted on ``X_train`` and another on ``Y_train``;
    each is then applied to both its train and test array, so the test split
    never contributes to the scaling statistics.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of input features (``shape[1]`` is read).
    Y_train, Y_test : 2-D arrays of targets (``shape[1]`` is read).
    params : accepted for call-site compatibility; not used in this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test, input_shape, output_shape,
        scaler_x, scaler_y)`` where the four arrays are the scaled versions,
        ``input_shape`` / ``output_shape`` are the column counts of the scaled
        training inputs / targets, and the two scalers are the fitted objects.
    """
    # Fit on training data only.
    scaler_x = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(Y_train)

    X_train_std, X_test_std = (scaler_x.transform(part) for part in (X_train, X_test))
    Y_train_std, Y_test_std = (scaler_y.transform(part) for part in (Y_train, Y_test))

    n_inputs = X_train_std.shape[1]
    n_outputs = Y_train_std.shape[1]

    return (
        X_train_std,
        X_test_std,
        Y_train_std,
        Y_test_std,
        n_inputs,
        n_outputs,
        scaler_x,
        scaler_y,
    )

# Seeded hold-out split, fed straight into the scaling step. train_test_split
# returns [X_train, X_test, Y_train, Y_test], which is exactly the positional
# order preprocess_data expects before `params`.
(
    X_train, X_test, Y_train, Y_test,
    input_shape, output_shape,
    scaler_x, scaler_y,
) = preprocess_data(
    *train_test_split(
        X,
        Y,
        test_size=params['test_size'],
        random_state=params['random_state'],
    ),
    params,
)

In [8]:
# Sanity check: the input scaler was fitted on X_train, so every column should
# come out with mean ~0 and std ~1 (up to float32 rounding).
X_train.mean(axis=0), X_train.std(axis=0)

(array([-2.1532178e-09, -8.4266070e-09, -1.6019621e-09,  8.1360341e-09,
         2.0712614e-09, -4.1774473e-09, -4.5262278e-09,  1.1421420e-08],
       dtype=float32),
 array([1.0000015 , 0.99999946, 0.9999916 , 0.999992  , 0.9999958 ,
        0.9999953 , 1.        , 0.9999921 ], dtype=float32))

## The test split appears to drift slightly from the training statistics; perhaps the sample is not big enough

In [9]:
# X_test was transformed with the scaler fitted on X_train, so its per-column
# mean/std are only approximately 0/1; the gap shows how far the splits differ.
X_test.mean(axis=0), X_test.std(axis=0)

(array([ 0.01566574, -0.01935916,  0.02933775, -0.03737441, -0.02744949,
         0.02566588,  0.0023744 ,  0.00630145], dtype=float32),
 array([1.0110551 , 1.0071383 , 0.9933347 , 1.0196439 , 1.0422928 ,
        0.99796194, 0.9964985 , 1.0558548 ], dtype=float32))

In [10]:
# Same sanity check for the targets: the target scaler was fitted on Y_train,
# so every column should have mean ~0 and std ~1.
Y_train.mean(axis=0), Y_train.std(axis=0)

(array([-7.4319542e-09,  3.9115547e-10, -8.8382510e-09,  1.0300427e-09,
        -1.4256686e-08,  2.6724010e-09,  2.4039299e-08], dtype=float32),
 array([0.9999978 , 1.0000006 , 0.9999992 , 1.0000019 , 1.        ,
        1.0000012 , 0.99999946], dtype=float32))

In [11]:
# Y_test was transformed with the scaler fitted on Y_train, so deviations from
# mean 0 / std 1 reflect the difference between the two splits.
Y_test.mean(axis=0), Y_test.std(axis=0)

(array([-0.02615544,  0.02592533,  0.01617737,  0.01231691,  0.02729797,
         0.02005751, -0.00557732], dtype=float32),
 array([0.9778238, 1.0593427, 1.0234559, 1.0446359, 1.0161207, 1.0485954,
        0.914684 ], dtype=float32))

## Dummy regressor

In [12]:
from sklearn.dummy import DummyRegressor

In [13]:
# Baseline: predict the training mean of every target, whatever the input.
# Any real model has to beat the MAE of this one.
regr = MultiOutputRegressor(estimator=DummyRegressor(strategy="mean"))
regr.fit(X_train, Y_train)
regr.predict(X_train)



array([[ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09],
       [ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09],
       [ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09],
       ...,
       [ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09],
       [ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09],
       [ 1.9073487e-09,  9.5367436e-10, -5.9604646e-09, ...,
        -4.0531161e-09,  4.7683718e-10,  4.7683715e-09]], dtype=float32)

In [14]:
# Training MAE of the current `regr`; Y_train is standardised, so the error is
# in units of each target's training standard deviation.
mean_absolute_error(y_true=Y_train, y_pred=regr.predict(X_train))

0.63199794

In [15]:
# Held-out MAE of the current `regr`, on the standardised test targets.
mean_absolute_error(y_true=Y_test, y_pred=regr.predict(X_test))

0.6350416

## Train a linear regression for debugging

In [16]:
# Linear reference model: one seeded Ridge regressor per target column.
# NOTE: this rebinds `regr`, replacing the dummy baseline fitted earlier.
regr = MultiOutputRegressor(estimator=Ridge(random_state=params['random_state']))
regr.fit(X_train, Y_train)
regr.predict(X_train)

array([[ 0.2668218 ,  0.49785665, -0.14110266, ..., -0.21119246,
        -0.11503971,  0.08385517],
       [-0.13136226, -0.21774325,  0.03752474, ...,  0.08587919,
         0.03905897, -0.05397544],
       [-0.12290849, -0.24603355,  0.10089954, ...,  0.11824025,
         0.06466386, -0.02495952],
       ...,
       [-0.12654151, -0.24451499,  0.07418114, ...,  0.10212538,
         0.06103366, -0.02823034],
       [-0.09874371, -0.20784873,  0.0917204 , ...,  0.11425355,
         0.07847044, -0.07360035],
       [ 0.15325338,  0.33014053, -0.10994104, ..., -0.10436093,
        -0.08484264, -0.10878628]], dtype=float32)

In [17]:
# Training MAE of the current `regr`, on the standardised training targets.
mean_absolute_error(y_true=Y_train, y_pred=regr.predict(X_train))

0.61378956

In [18]:
# Held-out MAE of the current `regr`, on the standardised test targets.
mean_absolute_error(y_true=Y_test, y_pred=regr.predict(X_test))

0.6197761

## Lazy regressor

In [19]:
from lazypredict.Supervised import LazyRegressor

# Benchmark lazypredict's collection of regressors on the scaled split, with
# MAE (imported in the first cell) passed as the custom metric.
# Y has several target columns, so estimators that only accept a 1-d y are
# reported as "failed to execute" and are missing from the results table.
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=mean_absolute_error)
models, predictions = reg.fit(X_train, X_test, Y_train, Y_test)

# Bare last expression: the notebook renders the full results table as a
# DataFrame instead of print()'s line-wrapped plain text.
models

  0%|                                                                                                                                                     | 0/42 [00:00<?, ?it/s]

AdaBoostRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


 10%|█████████████▍                                                                                                                               | 4/42 [00:01<00:11,  3.21it/s]

BayesianRidge model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
ElasticNetCV model failed to execute
For multi-task outputs, use MultiTaskElasticNetCV


 21%|██████████████████████████████▏                                                                                                              | 9/42 [00:05<00:20,  1.59it/s]

GammaRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


 36%|██████████████████████████████████████████████████                                                                                          | 15/42 [00:27<00:54,  2.01s/it]

GradientBoostingRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
HistGradientBoostingRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
HuberRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


 40%|████████████████████████████████████████████████████████▋                                                                                   | 17/42 [00:40<01:17,  3.09s/it]

LarsCV model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
LassoCV model failed to execute
For multi-task outputs, use MultiTaskLassoCV
LassoLarsCV model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
LassoLarsIC model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
LinearSVR model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 33/42 [00:44<00:07,  1.14it/s]

NuSVR model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
OrthogonalMatchingPursuitCV model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
PassiveAggressiveRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
PoissonRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
QuantileRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 36/42 [00:56<00:08,  1.49s/it]

SGDRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
SVR model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
TweedieRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [01:00<00:00,  1.45s/it]

LGBMRegressor model failed to execute
y should be a 1d array, got an array of shape (16000, 7) instead.
                            Adjusted R-Squared  R-Squared   RMSE  Time Taken  \
Model                                                                          
ExtraTreesRegressor                       0.66       0.66   0.60        3.84   
RandomForestRegressor                     0.64       0.64   0.62       11.98   
XGBRegressor                              0.64       0.64   0.62        4.36   
BaggingRegressor                          0.61       0.61   0.64        1.21   
MLPRegressor                              0.59       0.59   0.65        3.54   
KNeighborsRegressor                       0.51       0.51   0.72        0.18   
DecisionTreeRegressor                     0.29       0.29   0.87        0.20   
ExtraTreeRegressor                        0.24       0.24   0.90        0.04   
RidgeCV                                   0.04       0.04   0.99        0.01   
Ridge           


