## Evaluating the predictive accuracy of physics emulators without hyperparameter fine-tuning

In [1]:
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from duqling_interface import DuqlingInterface

from plot_performance import heatmap

In [2]:
from sklearn.cross_decomposition      import PLSRegression
from sklearn.linear_model             import LassoLars, ElasticNet
from sklearn.ensemble                 import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm                      import SVR
from sklearn.gaussian_process         import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel, WhiteKernel
from sklearn.pipeline                 import Pipeline
from sklearn.preprocessing            import StandardScaler

# Gaussian-process covariance: a constant-scaled Matern term plus a white-noise
# term. The tuples are the bounds handed to the kernel hyperparameters.
gpr_kernel = (
    ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-2, 1e2)) * Matern()
    + WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-6, 1e1))
)

# The GP is the only emulator in the list that is wrapped with a StandardScaler.
gpr_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('gpr', GaussianProcessRegressor(
        kernel=gpr_kernel,
        n_restarts_optimizer=3,
        random_state=42,
        normalize_y=True,
    )),
])

# Emulators under comparison. Apart from the arguments shown, every estimator
# is left at its library defaults (no hyperparameter search is performed).
models = [
    PLSRegression(n_components=1),
    LassoLars(random_state=42),
    ElasticNet(max_iter=10_000, random_state=42),
    ExtraTreesRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    SVR(kernel='rbf'),
    gpr_pipeline,
]

In [4]:
# Single interface object used by the cells below to list functions and
# generate data.
duq = DuqlingInterface()

# `fname` field of the listing filtered with response_type='uni'
# (presumably the univariate-response test functions -- confirm against
# DuqlingInterface).
uni_listing = duq.list_functions(response_type='uni')
univariate_funcs = uni_listing.fname

As Kelin pointed out, Latin hypercube sampling produces a different set of sample points for each random seed, so the training and test sets below are generated with different seeds.

In [None]:
# Column labels: the `fname` field of the function listing filtered with
# response_type='uni'.
duqling_funcs = duq.list_functions(response_type='uni').fname

# Row labels: the class name of each emulator. The scaled GP therefore shows
# up under the name of its wrapper class, "Pipeline".
model_names = [type(est).__name__ for est in models]

# Empty (all-NaN) results table, one row per model and one column per
# function; the evaluation loop fills it cell by cell.
df_mse = pd.DataFrame(index=model_names, columns=duqling_funcs)

In [None]:
# Fit every emulator on one generated data set per function and record its
# mean squared error on a second, separately generated data set.
for fname in tqdm(duqling_funcs):
    # 'cube3_rotate' is skipped, so its column in df_mse stays NaN.
    # NOTE(review): the reason for the exclusion is not recorded in this notebook.
    if fname == 'cube3_rotate':
        continue
    # 1000 points each; train and test use different seeds (41 vs 42) so the
    # test set is presumably a different sample from the training set.
    X_trn, y_trn = duq.generate_data(fname, 1000, seed=41)
    X_tst, y_tst = duq.generate_data(fname, 1000, seed=42)
    for model in models:
        # The same estimator objects are re-fitted for each function.
        model.fit(X_trn, y_trn)
        y_pred = model.predict(X_tst)
        # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but
        # keeping the documented order avoids a silent error if the metric is
        # later swapped for an asymmetric one (e.g. r2_score).
        df_mse.at[model.__class__.__name__, fname] = mean_squared_error(y_tst, y_pred)

  0%|          | 0/19 [00:00<?, ?it/s]

  model = cd_fast.enet_coordinate_descent(
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
  model = cd_fast.enet_coordinate_descent(


In [None]:
# Root-mean-squared error per (model, function). df_mse is created without a
# dtype, so cast to float before taking the vectorised square root; cells that
# were never filled remain NaN.
df_rmse = np.sqrt(df_mse.astype(float))


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



In [1]:
# The values plotted are square roots of the MSE table, so the figure is
# labelled as RMSE rather than MSE.
fig1 = heatmap(df_rmse, "Test RMSE")
# NOTE(review): the normalised variants below are left disabled; they reference
# `df_std` and `cols_to_drop`, neither of which is defined in the visible cells.
# fig2 = heatmap((df_rmse/df_std).drop(cols_to_drop, axis=1), "Test RMSE / \u03C3")
# fig3 = heatmap((df_rmse/df_std)[(df_rmse/df_std).drop(cols_to_drop, axis=1)>1].drop(cols_to_drop, axis=1), "(Test RMSE / \u03C3) > 1")

NameError: name 'heatmap' is not defined