# Polynomial Regression Hyperparameter Optimization

The purpose of this notebook is to find the optimal polynomial-regression hyperparameters for a given data set.  `GridSearchCV` cannot be used naively because generating the polynomial features is not a straightforward step to search over.  Note: this could possibly be done with a pipeline (to be investigated after the initial version is completed).

In [1]:
### Import Statements
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge as cpu_Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, ConstantKernel, WhiteKernel
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape, r2_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from tqdm.auto import tqdm, trange
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
import joblib
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from scipy.stats import uniform, expon, norm, randint
import matplotlib.pyplot as plt
import time 
import warnings
from scipy.optimize import minimize
from sklearn.preprocessing import MinMaxScaler
import time
import os
import sys
sys.path.append('../')
from helpful_functions import make_poly_datasets

In [2]:
# Run configuration.
# spectrum: True  -> use the full spectrum as regression targets,
#           False -> use only the characteristic energies.
spectrum = False
# noise: percentage of noise in the data set considered for this optimization.
noise = 10

# Number of regression targets implied by the choice above.
num_outputs = 25 if spectrum else 3

In [3]:
# Sample data
input_list = ['Intensity', 'Target Thickness', 'Focal Distance', 'Contrast'] # independent variables
if spectrum:
    output_list = ["Bin " + str(i) for i in range(25)]
    identifier = 'spectrum'
else:
    output_list = ['Max Proton Energy', 'Total Proton Energy', 'Avg Proton Energy']
    identifier = 'threeEns'

# Load the training table, replace missing values with 0, then shuffle the
# rows reproducibly (fixed random_state) and discard the old index.
train_df = pd.read_hdf(
    f'../datasets/fuchs_v5_0_seed-2_train_1525000_noise_{noise}_{identifier}_.h5',
    key='df',
)
train_df = train_df.fillna(0).sample(frac=1, random_state=42).reset_index(drop=True)
datype = np.float32

# Define X, y as arrays
X = np.array(train_df[input_list], dtype=datype)
y = np.array(train_df[output_list], dtype=datype)
num_pts_tot = X.shape[0]

# Derived from input_list so it stays correct when the feature list is edited
# (previously a hard-coded 4 that had to be changed by hand).
num_inputs = len(input_list)

In [4]:
# Make datasets.
# NOTE(review): make_poly_datasets is defined in helpful_functions; presumably it
# returns transformed train/validation splits plus the fitted input/output
# transformers -- confirm against that module.
(X_train, y_train,
 X_val, y_val,
 input_transformer, output_transformer) = make_poly_datasets(X, y, random_state=42)

# Targets mapped back through the output transformer's inverse.
y_train_true, y_val_true = (
    output_transformer.inverse_transform(targets) for targets in (y_train, y_val)
)

# Full data set passed through the same transformers.
X_scaled = input_transformer.transform(X)
y_scaled = output_transformer.transform(y)

In [5]:
# Build the model pipeline: polynomial feature expansion followed by ridge regression.
# The step names ('poly', 'ridge') are the prefixes used when addressing
# hyperparameters as '<step>__<param>'.
poly = PolynomialFeatures()
ridge = cpu_Ridge()

pipe = Pipeline(
    steps=[
        ('poly', poly),
        ('ridge', ridge),
    ]
)

In [6]:
# Hyperparameter grid: polynomial degrees 1..degree_end (inclusive) crossed with
# seven ridge penalties spaced logarithmically from 1e-3 to 1e3.
degree_end = 9
param_grid = {
    'poly__degree': np.arange(1, degree_end + 1),
    'ridge__alpha': np.logspace(-3, 3, num=7),
}

# Exhaustive 3-fold cross-validated search over the grid, scored by negative
# mean squared error and run on all available cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=3,
    scoring='neg_mean_squared_error',
)

In [7]:
# Run the cross-validated grid search on the transformed full data set.
# NOTE(review): X_scaled / y_scaled are transforms of the complete X / y. If
# X_val is a subset of X (confirm in make_poly_datasets), the validation points
# also take part in hyperparameter selection, so the validation error reported
# later may be optimistic.
grid.fit(X_scaled, y_scaled)

Fitting 3 folds for each of 63 candidates, totalling 189 fits


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

[CV 3/3] END poly__degree=2, ridge__alpha=1000.0;, score=-0.010 total time=   0.7s
[CV 3/3] END .poly__degree=5, ridge__alpha=1.0;, score=-0.003 total time=   3.9s
[CV 1/3] END .poly__degree=7, ridge__alpha=1.0;, score=-0.002 total time=   9.6s
[CV 3/3] END poly__degree=8, ridge__alpha=100.0;, score=-0.003 total time=  22.2s
[CV 3/3] END poly__degree=2, ridge__alpha=10.0;, score=-0.010 total time=   0.7s
[CV 2/3] END poly__degree=5, ridge__alpha=0.09999999999999999;, score=-0.002 total time=   3.9s
[CV 1/3] END poly__degree=7, ridge__alpha=10.0;, score=-0.002 total time=   9.6s
[CV 1/3] END poly__degree=8, ridge__alpha=1000.0;, score=-0.004 total time=  25.0s
[CV 1/3] END poly__degree=1, ridge__alpha=10.0;, score=-0.014 total time=   0.3s
[CV 2/3] END poly__degree=3, ridge__alpha=1000.0;, score=-0.008 total time=   1.2s
[CV 3/3] END poly__degree=5, ridge__alpha=1000.0;, score=-0.006 total time=   3.9s
[CV 2/3] END poly__degree=7, ridge__alpha=1000.0;, score=-0.005 total time=   9.8s
[C

In [8]:
# Report the winning hyperparameter combination and its mean CV score
# (negative MSE, so values closer to zero are better).
print(grid.best_params_, grid.best_score_, sep='\n')

{'poly__degree': 9, 'ridge__alpha': 0.001}
-0.0008072858909144998


In [9]:
# Display the raw cross-validation results dictionary produced by the search.
grid.cv_results_

{'mean_fit_time': array([  0.34500416,   0.18603086,   0.2176861 ,   0.25321738,
          0.25424083,   0.29286544,   0.28862945,   0.53682812,
          0.53215973,   0.57797519,   0.56081796,   0.5806071 ,
          0.57703996,   0.60698994,   1.00959667,   1.09660474,
          0.6567119 ,   0.73338763,   0.84495195,   0.861492  ,
          0.87953766,   1.74310112,   1.73176066,   1.80083394,
          1.7941428 ,   1.81133159,   1.76945933,   1.82847198,
         15.39944053,  14.65946547,   7.07929254,   2.95683773,
          2.97231515,   2.94250035,   2.89496644,  28.01772698,
         27.60530376,  27.57658005,   4.53381546,   4.59495719,
          4.51557279,   4.57749867,  52.9735016 ,  41.47192494,
         56.71148674,   7.33449062,   7.15356572,   7.17602515,
          7.48599331, 101.39387306, 101.52037112, 107.5716176 ,
         17.05788207,  39.78093521,  22.56624953,  21.15797369,
        150.78301525, 160.38498672, 146.9050614 ,  31.57900858,
         30.70994163,  

In [12]:
# Persist the full cross-validation results table for this noise level.
df = pd.DataFrame(grid.cv_results_)
results_dir = f'results/ridge_noise_{noise}'
# makedirs also creates the parent 'results/' directory when it is missing and
# does not fail if the target already exists (os.mkdir raised FileNotFoundError
# on a fresh checkout, and the separate exists() check was racy).
os.makedirs(results_dir, exist_ok=True)
df.to_csv(f'{results_dir}/grid_search_results.csv')

In [10]:
# Configure the pipeline with the best hyperparameters found by the search and
# refit it on the training split. Note that `pipe` itself is modified in place,
# so `best_model` and `pipe` refer to the same fitted object.
pipe.set_params(**grid.best_params_)
best_model = pipe.fit(X_train, y_train)

In [11]:
# Predict on the validation inputs and map the predictions back through the
# output transformer's inverse so they are comparable with y_val_true.
y_val_pred = output_transformer.inverse_transform(best_model.predict(X_val))

# Per-output mean absolute percentage error, in percent.
val_error = mape(y_val_true, y_val_pred, multioutput='raw_values') * 100
# Same metric with the argument roles swapped (predictions in the denominator).
rev_val_error = mape(y_val_pred, y_val_true, multioutput='raw_values') * 100

for label, err in (('Validation Error', val_error), ('Rev Validation Error', rev_val_error)):
    print(f'{label}: {err}%')
# Seems Least Accurate with Total Proton Energy

Validation Error: [15.978805 29.190075 14.10683 ]%
Rev Validation Error: [15.570532 27.076366 13.861969]%
