# Polynomial Regression Hyperparameter Optimization

Purpose of this notebook to find optimal polynomial hyperparameters for a given data set.  Cannot naively use GridSearchCV because of the non-straightforward process of making polynomial features.  Note: this possibly could be done with a pipeline (to investigate after initial version completed)

In [1]:
### Import Statements
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge as cpu_Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, ConstantKernel, WhiteKernel
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape, r2_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from tqdm.auto import tqdm, trange
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
import joblib
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from scipy.stats import uniform, expon, norm, randint
import matplotlib.pyplot as plt
import time 
import warnings
from scipy.optimize import minimize
from sklearn.preprocessing import MinMaxScaler
import time
from helpful_functions import make_datasets, make_poly_datasets

In [2]:
spectrum = False # True if using full spectrum, False if only using characteristic energies
noise_level = 30 # Percentage of noise in dataset to consider for optimization

if spectrum:
    num_outputs = 25
else:
    num_outputs = 3

In [3]:
train_df1 = pd.read_hdf('datasets/fuchs_v4-2_seed-2_train_1155231_noise_30_threeEns_campaign1.h5', key = 'df')
train_df1.columns

Index(['Intensity', 'Pulse Duration', 'Target Thickness', 'Spot Size',
       'Focal Distance', 'Contrast', 'Max Proton Energy',
       'Total Proton Energy', 'Avg Proton Energy', 'Max Exact Energy',
       'Total Exact Energy', 'Avg Exact Energy', 'Laser Energy',
       'Conversion Efficiency'],
      dtype='object')

In [4]:
# Sample data
input_list = ['Intensity', 'Target Thickness', 'Focal Distance', 'Contrast'] # independent variables
if spectrum:
    output_list = ["Bin " + str(i) for i in range(25)]
    identifier = 'spectrum'
else: 
    output_list = ['Max Proton Energy', 'Total Proton Energy', 'Avg Proton Energy']
    identifier = 'threeEns'

#train_df1 = pd.read_hdf(f'datasets/fuchs_v4-2_seed-2_train_1155231_noise_{noise_level}_{identifier}_campaign1.h5', key = 'df')#.sample(n=20000,random_state=1)
#train_df2 = pd.read_hdf(f'datasets/fuchs_v4-2_seed-2_train_1274091_noise_{noise_level}_{identifier}_campaign2.h5', key='df')
train_df = pd.read_hdf(f'datasets/fuchs_v5_0_seed-2_train_1525000_noise_10_{identifier}_.h5', key='df').fillna(0).sample(frac=1,random_state=42).reset_index(drop=True)
train_df.loc[:, output_list] = train_df.loc[:, output_list].replace(0, 1e-2)
test_df = pd.read_hdf(f'datasets/fuchs_v5_0_seed-2_test_1000000_noise_0_{identifier}_.h5', key = 'df')
test_df.loc[:, output_list] = test_df.loc[:, output_list].replace(0, 1e-2)
datype = np.float32

# Define X, y as arrays
X = np.array(train_df[input_list],dtype=datype)
y = np.array(train_df[output_list],dtype=datype)
num_pts_tot = X.shape[0]

X_test = np.array(test_df[input_list],dtype=datype)
y_test = np.array(test_df[output_list],dtype=datype)

num_inputs = 4 # Change if not using all input features

In [5]:
# Make datasets.  
X_train, y_train, X_val, y_val, input_transformer, output_transformer = make_poly_datasets(X, y, random_state=42)
y_train_true = output_transformer.inverse_transform(y_train)
y_val_true = output_transformer.inverse_transform(y_val)
X_scaled = input_transformer.transform(X)
y_scaled = output_transformer.transform(y)

In [6]:
# Make a pipeline for the model
poly = PolynomialFeatures()
ridge = cpu_Ridge()

pipe = Pipeline(steps=[('poly',poly),('ridge',ridge)])

In [7]:
# Define the parameter grid
param_grid = {
    'poly__degree': np.arange(1, 8, 1),
    'ridge__alpha': np.logspace(-3,3,7)
}
# Define the search
split_index = np.concatenate([np.full(X_train.shape[0],-1),np.zeros(X_val.shape[0])]) # -1 for training, 0 for validation
predef = PredefinedSplit(test_fold=split_index)
grid = GridSearchCV(pipe, param_grid, cv=predef, n_jobs=-1, verbose=3, scoring='neg_mean_squared_error')

In [8]:
grid.fit(X_scaled, y_scaled)

Fitting 1 folds for each of 49 candidates, totalling 49 fits


[CV 1/1] END poly__degree=1, ridge__alpha=0.01;, score=-0.003 total time=   0.3s
[CV 1/1] END poly__degree=1, ridge__alpha=0.001;, score=-0.003 total time=   0.4s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END .poly__degree=1, ridge__alpha=0.1;, score=-0.003 total time=   0.3s
[CV 1/1] END .poly__degree=1, ridge__alpha=1.0;, score=-0.003 total time=   0.3s
[CV 1/1] END poly__degree=1, ridge__alpha=10.0;, score=-0.003 total time=   0.3s
[CV 1/1] END poly__degree=1, ridge__alpha=100.0;, score=-0.003 total time=   0.3s
[CV 1/1] END poly__degree=1, ridge__alpha=1000.0;, score=-0.003 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END poly__degree=2, ridge__alpha=0.001;, score=-0.002 total time=   0.6s
[CV 1/1] END poly__degree=2, ridge__alpha=0.01;, score=-0.002 total time=   0.5s
[CV 1/1] END .poly__degree=2, ridge__alpha=0.1;, score=-0.002 total time=   3.0s
[CV 1/1] END .poly__degree=2, ridge__alpha=1.0;, score=-0.002 total time=   3.6s
[CV 1/1] END poly__degree=2, ridge__alpha=10.0;, score=-0.002 total time=   3.6s
[CV 1/1] END poly__degree=2, ridge__alpha=1000.0;, score=-0.002 total time=   3.8s
[CV 1/1] END poly__degree=2, ridge__alpha=100.0;, score=-0.002 total time=   3.9s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END poly__degree=3, ridge__alpha=0.001;, score=-0.002 total time=   5.7s
[CV 1/1] END poly__degree=3, ridge__alpha=100.0;, score=-0.002 total time=   5.8s
[CV 1/1] END .poly__degree=3, ridge__alpha=1.0;, score=-0.002 total time=   6.1s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END .poly__degree=3, ridge__alpha=0.1;, score=-0.002 total time=   7.1s
[CV 1/1] END poly__degree=3, ridge__alpha=0.01;, score=-0.002 total time=   7.6s
[CV 1/1] END poly__degree=3, ridge__alpha=10.0;, score=-0.002 total time=   8.1s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END .poly__degree=4, ridge__alpha=0.1;, score=-0.001 total time=  10.2s
[CV 1/1] END poly__degree=4, ridge__alpha=10.0;, score=-0.001 total time=  10.2s
[CV 1/1] END poly__degree=4, ridge__alpha=100.0;, score=-0.002 total time=  10.7s
[CV 1/1] END poly__degree=3, ridge__alpha=1000.0;, score=-0.002 total time=  11.0s
[CV 1/1] END poly__degree=4, ridge__alpha=1000.0;, score=-0.002 total time=  11.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END poly__degree=4, ridge__alpha=0.001;, score=-0.001 total time=  12.8s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END poly__degree=4, ridge__alpha=0.01;, score=-0.001 total time=  14.4s
[CV 1/1] END poly__degree=5, ridge__alpha=1000.0;, score=-0.002 total time=  14.0s
[CV 1/1] END .poly__degree=4, ridge__alpha=1.0;, score=-0.001 total time=  15.1s
[CV 1/1] END poly__degree=5, ridge__alpha=100.0;, score=-0.001 total time=  16.3s
[CV 1/1] END .poly__degree=5, ridge__alpha=1.0;, score=-0.001 total time=  16.8s
[CV 1/1] END poly__degree=5, ridge__alpha=10.0;, score=-0.001 total time=  17.2s
[CV 1/1] END poly__degree=6, ridge__alpha=10.0;, score=-0.001 total time=  18.8s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END .poly__degree=6, ridge__alpha=1.0;, score=-0.001 total time=  20.4s
[CV 1/1] END poly__degree=6, ridge__alpha=100.0;, score=-0.001 total time=  23.3s
[CV 1/1] END poly__degree=6, ridge__alpha=1000.0;, score=-0.002 total time=  24.7s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/1] END poly__degree=5, ridge__alpha=0.001;, score=-0.001 total time=  26.8s
[CV 1/1] END .poly__degree=7, ridge__alpha=1.0;, score=-0.001 total time=  26.9s
[CV 1/1] END poly__degree=7, ridge__alpha=100.0;, score=-0.001 total time=  27.5s
[CV 1/1] END poly__degree=5, ridge__alpha=0.01;, score=-0.001 total time=  27.9s
[CV 1/1] END poly__degree=7, ridge__alpha=1000.0;, score=-0.001 total time=  28.3s
[CV 1/1] END poly__degree=7, ridge__alpha=10.0;, score=-0.001 total time=  28.6s
[CV 1/1] END .poly__degree=5, ridge__alpha=0.1;, score=-0.001 total time=  30.1s
[CV 1/1] END poly__degree=6, ridge__alpha=0.001;, score=-0.001 total time=  36.0s
[CV 1/1] END poly__degree=6, ridge__alpha=0.01;, score=-0.001 total time=  38.2s
[CV 1/1] END .poly__degree=6, ridge__alpha=0.1;, score=-0.001 total time=  39.3s
[CV 1/1] END poly__degree=7, ridge__alpha=0.01;, score=-0.001 total time=  52.0s
[CV 1/1] END poly__degree=7, ridge__alpha=0.001;, score=-0.001 total time=  56.0s
[CV 1/1] END .poly__de

In [9]:
# Find the best parameters
grid.best_params_

{'poly__degree': 7, 'ridge__alpha': 0.001}

In [10]:
grid.cv_results_

{'mean_fit_time': array([ 0.35374451,  0.23573589,  0.25132608,  0.2781148 ,  0.24955153,
         0.27973723,  0.28906965,  0.50042367,  0.51636934,  2.68968105,
         3.33805633,  3.27956438,  3.05473948,  3.51838374,  4.98251963,
         7.0733912 ,  6.72105861,  5.57529092,  7.68856931,  5.36860824,
        10.2215662 , 12.16871953, 13.27751422,  9.5955627 , 13.98556757,
         9.61524034,  9.99576211, 10.89238381, 26.29158378, 27.54195929,
        29.81949902, 16.095186  , 16.60161138, 15.50256872, 13.2837379 ,
        35.72878599, 37.84492207, 38.95056295, 19.60767961, 18.2025578 ,
        22.69950223, 24.03485823, 55.57237959, 51.59804845, 56.45910692,
        26.05283952, 28.21461415, 26.81028152, 27.86118627]),
 'std_fit_time': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'mean_score_time': array(

In [11]:
best_model = pipe.set_params(**grid.best_params_).fit(X_train, y_train)

In [12]:
y_val_pred = output_transformer.inverse_transform(best_model.predict(X_val))
val_error = mape(y_val_true, y_val_pred)*100
print(f'Validation Error: {val_error:.2f}%')

Validation Error: 3057162600.00%


In [13]:
X_test_scaled = input_transformer.transform(X_test)
y_test_pred = output_transformer.inverse_transform(best_model.predict(X_test_scaled))
try:
    test_error = mape(y_test, y_test_pred)*100
    print(f'Test Error: {test_error:.2f}%')
except:
    print("Model failed to generalize.  No test error calculated.")

Test Error: 601500750.00%
