In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from keras.optimizers import Adam

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_absolute_error

In [2]:
params = {
    'test_size': 0.2,
    'random_state': 1337,
    'validation_split': 0.2
}

In [5]:
df = pd.read_parquet("../data/qsc_out.random_scan_nfp2.parquet")

In [6]:
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,y0,y1,y2,y3,y4,y5,y6
0,0.133438,-0.09295,0.011193,-0.005795,0.000771,-0.000122,-0.713354,0.129141,1.482134,0.213466,0.241207,0.475203,0.852697,1.06753,0.342066
1,0.106896,0.10748,0.007474,0.007549,0.000577,0.000333,-0.71844,-0.309748,1.711563,0.178743,0.181471,0.403228,0.821898,1.052911,0.33332
2,0.100286,0.098256,0.006654,0.006411,0.000179,0.000225,-0.775836,0.064299,1.895026,0.154513,0.204983,0.335392,0.807669,0.965668,0.331046
3,0.120435,0.114909,0.009287,0.008526,0.000167,0.000237,-0.767415,0.016524,1.380982,0.196482,0.304152,0.364896,0.833857,1.286199,0.337564
4,-0.12805,0.160498,0.009165,-0.010457,-0.000347,0.0002,-0.719544,0.569769,0.962259,0.304666,0.529468,0.435512,0.834355,1.311465,0.340612


In [9]:
df.shape

(1000000, 15)

In [10]:
x_columns = [col for col in df.columns if col.startswith('x')]
y_columns = [col for col in df.columns if col.startswith('y')]

Y = df[y_columns].values
X = df[x_columns].values

In [11]:
def preprocess_data(X_train, X_test, Y_train, Y_test, params):
    scaler_x = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(Y_train)
    X_train = scaler_x.transform(X_train)
    X_test = scaler_x.transform(X_test)
    Y_train = scaler_y.transform(Y_train)
    Y_test = scaler_y.transform(Y_test)

    input_shape = X_train.shape[1]
    
    output_shape = Y_train.shape[1]
    return X_train, X_test, Y_train, Y_test, input_shape, output_shape, scaler_x, scaler_y

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=params['test_size'], 
                                                    random_state=params['random_state'])

X_train, X_test, Y_train, Y_test, input_shape, output_shape, scaler_x, scaler_y = preprocess_data(X_train, X_test, Y_train, Y_test, params)

In [12]:
X_train.mean(axis=0), X_train.std(axis=0) 

(array([-1.1327621e-08,  2.1925195e-09, -5.5531042e-09,  7.3113129e-09,
        -1.3621244e-09,  3.0783935e-09,  1.1375323e-08, -3.8778780e-09],
       dtype=float32),
 array([0.9993136 , 0.99944884, 0.99930733, 0.9995662 , 0.99942106,
        0.999486  , 0.9999612 , 0.9995944 ], dtype=float32))

## Appears to be drift, perhaps the sample is not big enough

In [13]:
X_test.mean(axis=0), X_test.std(axis=0) 

(array([-0.00667486,  0.00326206, -0.00185904, -0.00032165, -0.00151821,
        -0.0017868 , -0.00261114,  0.00337474], dtype=float32),
 array([0.99684393, 1.0012466 , 1.000381  , 1.002924  , 0.9978705 ,
        0.9981674 , 1.002929  , 1.0042428 ], dtype=float32))

In [14]:
Y_train.mean(axis=0), Y_train.std(axis=0) 

(array([-1.8581934e-09, -5.8630110e-09, -2.2679568e-10,  1.4387071e-10,
         4.0571391e-09,  5.9454890e-09, -1.5963987e-09], dtype=float32),
 array([1.0002788 , 0.9998443 , 0.9994234 , 0.9997594 , 0.99986285,
        0.99981064, 0.9998727 ], dtype=float32))

In [15]:
Y_test.mean(axis=0), Y_test.std(axis=0)

(array([-0.00227338,  0.00395122,  0.00310262,  0.00321028,  0.00143382,
         0.00332455, -0.00154998], dtype=float32),
 array([0.99452734, 1.0021232 , 1.0060604 , 1.008807  , 0.9999831 ,
        1.0016111 , 0.99785244], dtype=float32))

## Dummy regressor

In [16]:
from sklearn.dummy import DummyRegressor

In [17]:
regr = MultiOutputRegressor(DummyRegressor(strategy="mean")).fit(X_train, Y_train)
regr.predict(X_train)

array([[ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09],
       [ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09],
       [ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09],
       ...,
       [ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09],
       [ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09],
       [ 5.0783155e-10, -6.2942505e-09,  6.4849853e-10, ...,
        -2.5177003e-09, -1.7833710e-09,  3.0517577e-09]], dtype=float32)

In [18]:
mean_absolute_error(Y_train, regr.predict(X_train))

0.63178873

In [19]:
mean_absolute_error(Y_test, regr.predict(X_test))

0.6326293

## Train a linear regression for debugging

In [20]:
regr = MultiOutputRegressor(Ridge(random_state=params['random_state'])).fit(X_train, Y_train)
regr.predict(X_train)

array([[-6.92282617e-02, -1.54025435e-01,  3.67904827e-02, ...,
         8.91883895e-02,  1.03378534e-01, -4.63521540e-01],
       [-2.72226613e-02, -6.77829310e-02, -2.33512698e-03, ...,
         5.69161810e-02,  1.49837554e-01, -8.12186658e-01],
       [-1.34016767e-01, -2.40030453e-01,  7.71415457e-02, ...,
         1.01409532e-01,  5.49193025e-02, -7.21290754e-03],
       ...,
       [ 2.92817801e-01,  5.27068496e-01, -1.71872094e-01, ...,
        -2.38644585e-01, -1.26464874e-01,  2.43872833e-02],
       [ 8.15712567e-03,  1.36674335e-02,  7.52523541e-04, ...,
        -1.09247095e-03, -7.78605090e-03,  9.96023323e-03],
       [-8.90059397e-02, -1.57348022e-01,  3.95330749e-02, ...,
         8.14322233e-02,  4.80963849e-02, -4.81136180e-02]], dtype=float32)

In [21]:
mean_absolute_error(Y_train, regr.predict(X_train))

0.61463964

In [22]:
mean_absolute_error(Y_test, regr.predict(X_test))

0.6148032

## Simplest neural network

In [23]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [24]:
# Define Sequential model with 3 layers
model = keras.Sequential(
    [
        layers.Dense(input_shape, activation="relu", name="layer_in"),
        layers.Dense(128, activation="relu", name="layer2"),
        layers.Dense(64, activation="relu", name="layer3"),
        layers.Dense(output_shape, name="layer_out"),
    ]
)

model(X_train[0:1])

<tf.Tensor: shape=(1, 7), dtype=float32, numpy=
array([[ 0.0311353 ,  0.06876908, -0.1295784 , -0.09838126, -0.34702867,
         0.04909335, -0.15615612]], dtype=float32)>

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer_in (Dense)            (1, 8)                    72        
                                                                 
 layer2 (Dense)              (1, 128)                  1152      
                                                                 
 layer3 (Dense)              (1, 64)                   8256      
                                                                 
 layer_out (Dense)           (1, 7)                    455       
                                                                 
Total params: 9935 (38.81 KB)
Trainable params: 9935 (38.81 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model.compile(
    optimizer=keras.optimizers.RMSprop(),  # Optimizer
    # Loss function to minimize
    loss=keras.losses.MeanAbsoluteError(),
    # List of metrics to monitor
    metrics=[keras.metrics.MeanAbsoluteError()],
)



In [27]:
print("Fit model on training data")
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=400,
    # We pass some validation for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=(X_test, Y_test),
)

Fit model on training data
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 7

Epoch 96/400
Epoch 97/400
Epoch 98/400
Epoch 99/400
Epoch 100/400
Epoch 101/400
Epoch 102/400
Epoch 103/400
Epoch 104/400
Epoch 105/400
Epoch 106/400
Epoch 107/400
Epoch 108/400
Epoch 109/400
Epoch 110/400
Epoch 111/400
Epoch 112/400
Epoch 113/400
Epoch 114/400
Epoch 115/400
Epoch 116/400
Epoch 117/400
Epoch 118/400
Epoch 119/400
Epoch 120/400
Epoch 121/400
Epoch 122/400
Epoch 123/400
Epoch 124/400
Epoch 125/400
Epoch 126/400
Epoch 127/400
Epoch 128/400
Epoch 129/400
Epoch 130/400
Epoch 131/400
Epoch 132/400
Epoch 133/400
Epoch 134/400
Epoch 135/400
Epoch 136/400
Epoch 137/400
Epoch 138/400
Epoch 139/400
Epoch 140/400
Epoch 141/400
Epoch 142/400
Epoch 143/400
Epoch 144/400
Epoch 145/400
Epoch 146/400
Epoch 147/400
Epoch 148/400
Epoch 149/400
Epoch 150/400
Epoch 151/400
Epoch 152/400
Epoch 153/400
Epoch 154/400
Epoch 155/400
Epoch 156/400
Epoch 157/400
Epoch 158/400
Epoch 159/400
Epoch 160/400
Epoch 161/400
Epoch 162/400
Epoch 163/400
Epoch 164/400
Epoch 165/400
Epoch 166/400
Epoch 167/

Epoch 191/400
Epoch 192/400
Epoch 193/400
Epoch 194/400
Epoch 195/400
Epoch 196/400
Epoch 197/400
Epoch 198/400
Epoch 199/400
Epoch 200/400
Epoch 201/400
Epoch 202/400
Epoch 203/400
Epoch 204/400
Epoch 205/400
Epoch 206/400
Epoch 207/400
Epoch 208/400
Epoch 209/400
Epoch 210/400
Epoch 211/400
Epoch 212/400
Epoch 213/400
Epoch 214/400
Epoch 215/400
Epoch 216/400
Epoch 217/400
Epoch 218/400
Epoch 219/400
Epoch 220/400
Epoch 221/400
Epoch 222/400
Epoch 223/400
Epoch 224/400
Epoch 225/400
Epoch 226/400
Epoch 227/400
Epoch 228/400
Epoch 229/400
Epoch 230/400
Epoch 231/400
Epoch 232/400
Epoch 233/400
Epoch 234/400
Epoch 235/400
Epoch 236/400
Epoch 237/400
Epoch 238/400
Epoch 239/400
Epoch 240/400
Epoch 241/400
Epoch 242/400
Epoch 243/400
Epoch 244/400
Epoch 245/400
Epoch 246/400
Epoch 247/400
Epoch 248/400
Epoch 249/400
Epoch 250/400
Epoch 251/400
Epoch 252/400
Epoch 253/400
Epoch 254/400
Epoch 255/400
Epoch 256/400
Epoch 257/400
Epoch 258/400
Epoch 259/400
Epoch 260/400
Epoch 261/400
Epoch 

Epoch 285/400
Epoch 286/400
Epoch 287/400
Epoch 288/400
Epoch 289/400
Epoch 290/400
Epoch 291/400
Epoch 292/400
Epoch 293/400
Epoch 294/400
Epoch 295/400
Epoch 296/400
Epoch 297/400
Epoch 298/400
Epoch 299/400
Epoch 300/400
Epoch 301/400
Epoch 302/400
Epoch 303/400
Epoch 304/400
Epoch 305/400
Epoch 306/400
Epoch 307/400
Epoch 308/400
Epoch 309/400
Epoch 310/400
Epoch 311/400
Epoch 312/400
Epoch 313/400
Epoch 314/400
Epoch 315/400
Epoch 316/400
Epoch 317/400
Epoch 318/400
Epoch 319/400
Epoch 320/400
Epoch 321/400
Epoch 322/400
Epoch 323/400
Epoch 324/400
Epoch 325/400
Epoch 326/400
Epoch 327/400
Epoch 328/400
Epoch 329/400
Epoch 330/400
Epoch 331/400
Epoch 332/400
Epoch 333/400
Epoch 334/400
Epoch 335/400
Epoch 336/400
Epoch 337/400
Epoch 338/400
Epoch 339/400
Epoch 340/400
Epoch 341/400
Epoch 342/400
Epoch 343/400
Epoch 344/400
Epoch 345/400
Epoch 346/400
Epoch 347/400
Epoch 348/400
Epoch 349/400
Epoch 350/400
Epoch 351/400
Epoch 352/400
Epoch 353/400
Epoch 354/400
Epoch 355/400
Epoch 

Epoch 379/400
Epoch 380/400
Epoch 381/400
Epoch 382/400
Epoch 383/400
Epoch 384/400
Epoch 385/400
Epoch 386/400
Epoch 387/400
Epoch 388/400
Epoch 389/400
Epoch 390/400
Epoch 391/400
Epoch 392/400
Epoch 393/400
Epoch 394/400
Epoch 395/400
Epoch 396/400
Epoch 397/400
Epoch 398/400
Epoch 399/400
Epoch 400/400


In [26]:
mean_absolute_error(Y_train, model.predict(X_train))



0.10905128717603396

In [27]:
mean_absolute_error(Y_test, model.predict(X_test))



0.11408455206590504