In [32]:
import pickle
import numpy as np
import pandas as pd

from ngboost import NGBRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

In [51]:
# Load the four dataset_liquid_* CSV files from the local olympus checkout.

# Column layout applied to every CSV: seven parameter columns, then two objective columns.
param_names = ['src_flow', 'dst_flow', 'airgap', 'post_airgap', 'extra_volume', 'src_equib', 'dst_equib']
obj_names = ['error', 'stdev']
col_names = param_names + obj_names


def read_liquid_csv(csv_path):
    """Read one dataset CSV and label its columns with `col_names`."""
    # Passing `names=` without `header=` makes pandas treat the first line as data.
    return pd.read_csv(csv_path, names=col_names)


df_thf_100 = read_liquid_csv('../../../olympus/src/olympus/datasets/dataset_liquid_thf_100/data.csv')
df_thf_500 = read_liquid_csv('../../../olympus/src/olympus/datasets/dataset_liquid_thf_500/data.csv')
df_hep_100 = read_liquid_csv('../../../olympus/src/olympus/datasets/dataset_liquid_hep_100/data.csv')
df_ace_100 = read_liquid_csv('../../../olympus/src/olympus/datasets/dataset_liquid_ace_100/data.csv')

# Report the number of rows in each dataset.
print(f'thf 100 : {df_thf_100.shape[0]}')
print(f'thf 500 : {df_thf_500.shape[0]}')
print(f'hep 100 : {df_hep_100.shape[0]}')
print(f'ace 100 : {df_ace_100.shape[0]}')

thf 100 : 39
thf 500 : 67
hep 100 : 69
ace 100 : 48


In [52]:
# Build a shuffled 75/25 train/test split from the `df_hep_100` frame.
X = df_hep_100[param_names].values
y = df_hep_100[obj_names].values

train_frac = 0.75
num_train = int(train_frac * y.shape[0])  # int() truncates, so the test set gets the remainder

# Fixed seed on the global NumPy RNG keeps the shuffled row order repeatable.
indices = np.arange(y.shape[0])
np.random.seed(100701)
np.random.shuffle(indices)

train_indices, test_indices = indices[:num_train], indices[num_train:]

train_X, train_y = X[train_indices, :], y[train_indices, :]
test_X, test_y = X[test_indices, :], y[test_indices, :]

# Sanity check: shapes of the full, train and test arrays.
for features, targets in ((X, y), (train_X, train_y), (test_X, test_y)):
    print(features.shape, targets.shape)

(69, 7) (69, 2)
(51, 7) (51, 2)
(18, 7) (18, 2)


In [53]:
# ngboost hyperparams
n_estimators = 1500
lr = 0.001
tol = 1e-4
# NOTE(review): `patience` is not passed to NGBRegressor below and is not used by
# the visible fit cell — confirm whether early stopping was intended.
patience = 100

# One independent regressor per objective column of `y`.
model = []
for _ in range(y.shape[1]):
    model.append(
        NGBRegressor(n_estimators=n_estimators, learning_rate=lr, tol=tol)
    )
    

In [54]:
# Fit each regressor on its own objective column (column `ix` of `train_y`),
# with progress logging switched on.
for ix, regressor in enumerate(model):
    regressor.verbose = True
    regressor.fit(train_X, train_y[:, ix])
    

[iter 0] loss=-1.7183 val_loss=0.0000 scale=2.0000 norm=1.6187
[iter 100] loss=-2.1501 val_loss=0.0000 scale=2.0000 norm=0.8853
[iter 200] loss=-2.2565 val_loss=0.0000 scale=2.0000 norm=0.8412
[iter 300] loss=-2.3501 val_loss=0.0000 scale=2.0000 norm=0.8360
[iter 400] loss=-2.4393 val_loss=0.0000 scale=2.0000 norm=0.8404
[iter 500] loss=-2.5262 val_loss=0.0000 scale=2.0000 norm=0.8446
[iter 600] loss=-2.6115 val_loss=0.0000 scale=2.0000 norm=0.8459
[iter 700] loss=-2.6956 val_loss=0.0000 scale=2.0000 norm=0.8448
[iter 800] loss=-2.7782 val_loss=0.0000 scale=2.0000 norm=0.8415
[iter 900] loss=-2.8596 val_loss=0.0000 scale=2.0000 norm=0.8365
[iter 1000] loss=-2.9414 val_loss=0.0000 scale=2.0000 norm=0.8330
[iter 1100] loss=-3.0217 val_loss=0.0000 scale=2.0000 norm=0.8276
[iter 1200] loss=-3.1008 val_loss=0.0000 scale=2.0000 norm=0.8210
[iter 1300] loss=-3.1787 val_loss=0.0000 scale=2.0000 norm=0.8133
[iter 1400] loss=-3.2577 val_loss=0.0000 scale=2.0000 norm=0.8092
[iter 0] loss=-3.6436 

In [55]:
# Collect the predicted location and standard deviation from every fitted
# regressor, for both the test and the training features.
y_train_mu, y_train_std = [], []
y_test_mu, y_test_std = [], []

for m in model:
    # Test split first, then train split, for each regressor in turn.
    for features, mu_acc, std_acc in (
        (test_X, y_test_mu, y_test_std),
        (train_X, y_train_mu, y_train_std),
    ):
        y_dists = m.pred_dist(features)
        mu_acc.append(y_dists.loc)
        std_acc.append(np.sqrt(y_dists.var))  # standard deviation = sqrt(variance)

# Stack the per-regressor vectors and transpose so that rows index samples and
# columns index regressors.
y_train_mu, y_train_std = (
    np.stack(per_model, axis=0).T for per_model in (y_train_mu, y_train_std)
)
y_test_mu, y_test_std = (
    np.stack(per_model, axis=0).T for per_model in (y_test_mu, y_test_std)
)

print(y_train_mu.shape, y_train_std.shape)

(51, 2) (51, 2)


In [56]:
# R^2 of the predicted means against the true targets, per objective column
# (column 0 -> `r2_e_*`, column 1 -> `r2_s_*`), on the train and test splits.
r2_e_train, r2_s_train = (
    r2_score(train_y[:, col], y_train_mu[:, col]) for col in (0, 1)
)
r2_e_test, r2_s_test = (
    r2_score(test_y[:, col], y_test_mu[:, col]) for col in (0, 1)
)

# NOTE(review): in the saved output the train scores are high while both test
# scores are negative — possibly overfitting on this small dataset; confirm.
print(r2_e_train, r2_s_train)
print(r2_e_test, r2_s_test)

0.9778807752734834 0.7978432217888721
-0.06314631302617757 -0.08000482473061843
