# 2.03 - Non-Linear Model Parameter Search
This notebook performs a parameter search for nonlinear models to predict the relationship between the COB mean and BG mean values at lag 0. The goal is to find the best hyperparameters for models such as Support Vector Regression (SVR), Decision Tree Regressor, and Random Forest Regressor.

In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

from src.reliationship_modelling import AnalyseRelationships
from src.config import PROCESSED_DATA_DIR

[32m2025-08-16 15:31:54.836[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\ross\OneDrive\Documents\Masters\Project\masters_project[0m


In [2]:
with open(PROCESSED_DATA_DIR / 'cust_analyser.pkl', 'rb') as file:
    analyser = pickle.load(file)

df_to_compare = analyser.return_dataset_with_clusters()
variables = ['cob mean', 'iob mean', 'bg mean']
scaled_cols = ['cob_mean_scaled', 'iob_mean_scaled', 'bg_mean_scaled']
scaler = StandardScaler()
df_to_compare[scaled_cols] = scaler.fit_transform(df_to_compare[variables])
rel = AnalyseRelationships(
    df_to_compare.rename(columns={'cluster_label': 'cluster'}))

X = df_to_compare[['cob_mean_scaled']].values
y = df_to_compare['bg_mean_scaled'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def find_scores(search, X_train, y_train, X_test, y_test):
    search.fit(X_train, y_train)
    best_params = search.best_params_
    best_score = search.best_score_ # This is the R^2 score from cross-validation
    print("Best Parameters:", best_params)
    print("Best R^2 Score from Cross-Validation:", best_score)

    best_regressor = search.best_estimator_
    y_pred = best_regressor.predict(X_test)
    final_mse = mean_squared_error(y_test, y_pred)
    print("Final Mean Squared Error on Test Data:", final_mse)

In [3]:
svr = SVR()
param_distributions = {
    'kernel': ['rbf'],
    'C': np.logspace(-3, 3, 7),  # e.g., 0.001, 0.01, 0.1, 1, 10, 100, 1000
    'gamma': np.logspace(-3, 3, 7),
    'epsilon': [0.1, 0.2, 0.5]
}

search = RandomizedSearchCV(svr, param_distributions, n_iter=40, cv=5, verbose=3, random_state=42, n_jobs=-1)
find_scores(search, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Parameters: {'kernel': 'rbf', 'gamma': np.float64(1.0), 'epsilon': 0.5, 'C': np.float64(1000.0)}
Best R^2 Score from Cross-Validation: -0.011772264440382285
Final Mean Squared Error on Test Data: 1.005549743254307
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Parameters: {'kernel': 'rbf', 'gamma': np.float64(1.0), 'epsilon': 0.5, 'C': np.float64(1000.0)}
Best R^2 Score from Cross-Validation: -0.011772264440382285
Final Mean Squared Error on Test Data: 1.005549743254307
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Parameters: {'kernel': 'rbf', 'gamma': np.float64(1.0), 'epsilon': 0.5, 'C': np.float64(1000.0)}
Best R^2 Score from Cross-Validation: -0.011772264440382285
Final Mean Squared Error on Test Data: 1.005549743254307


In [4]:
dt_regressor = DecisionTreeRegressor(random_state=42)
param_distributions = {
    'max_depth': [3, 5, 7, 10, None], # None means no limit
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [1.0, 'sqrt', 'log2', 0.5]
}

search = RandomizedSearchCV(dt_regressor, param_distributions, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
find_scores(search, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5}
Best R^2 Score from Cross-Validation: 0.005487279853269267
Final Mean Squared Error on Test Data: 0.9900844009955123


In [5]:
rf_regressor = RandomForestRegressor(random_state=42)
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None] # None means all features
}

search = RandomizedSearchCV(rf_regressor, param_distributions, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
find_scores(search, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5}
Best R^2 Score from Cross-Validation: 0.005932272814757389
Final Mean Squared Error on Test Data: 0.9900803899204302
