In [42]:
import numpy as np
from sklearn.model_selection import train_test_split

# Dataset revision; selects the sub-folder read below (and is reused by the
# model-saving cell for its output folder).
version = "v5"

# Feature matrix and target array for the chosen revision.
X = np.load(f"../processed-data/{version}/X.npy")
y = np.load(f"../processed-data/{version}/y.npy")

# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score


# Install a process-wide "ignore" filter with no category, message or module
# restriction, so every warning raised after this cell runs is suppressed.
# NOTE(review): presumably meant to quiet noisy output during the grid search;
# it also hides convergence and deprecation warnings — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [44]:
import importlib

# Registry mapping an estimator class name to the dotted path of the module
# that defines it; get_model() uses it to import the class by name.
CLASS_MODULES = dict(
    LinearRegression="sklearn.linear_model",
    SVR="sklearn.svm",
    DecisionTreeRegressor="sklearn.tree",
    Lasso="sklearn.linear_model",
    Ridge="sklearn.linear_model",
    KNeighborsRegressor="sklearn.neighbors",
    RandomForestRegressor="sklearn.ensemble",
    PoissonRegressor="sklearn.linear_model",
)

def get_model(model_name):
    """Instantiate the estimator registered under ``model_name``.

    The class is looked up in ``CLASS_MODULES`` (class name -> module path),
    imported dynamically and constructed with its default arguments.

    Parameters
    ----------
    model_name : str
        Name of the estimator class, e.g. ``"Ridge"``.

    Returns
    -------
    object or None
        A new instance of the class, or ``None`` when the name is not
        registered, the module cannot be imported, or the module does not
        expose the class. The reason is printed in each failure case.
    """
    module_name = CLASS_MODULES.get(model_name)
    if module_name is None:
        # Without this guard import_module(None) raises an AttributeError about
        # NoneType, which was reported as a confusing "accessing class" error.
        print(f"Error accessing class {model_name}: not registered in CLASS_MODULES")
        return None

    try:
        module = importlib.import_module(module_name)
        clf_class = getattr(module, model_name)
        return clf_class()
    except ImportError as e:
        print(f"Error importing module: {e}")
    except AttributeError as e:
        print(f"Error accessing class {model_name}: {e}")

    # Best-effort contract: failures are reported above and signalled with None.
    return None

In [45]:
# Hyper-parameter search space per estimator. The outer key is the estimator
# class name (resolved through get_model); the inner mapping is the grid handed
# to GridSearchCV, so every value is a list of candidates. Single-element lists
# pin a setting instead of searching over it.
param_grid = dict(
    LinearRegression=dict(
        n_jobs=[-1],
    ),
    Lasso=dict(
        alpha=[0.1, 0.5, 1.0, 5.0, 10.0],
        max_iter=[2500],
    ),
    Ridge=dict(
        alpha=[0.1, 0.5, 1.0, 5.0, 10.0],
    ),
    DecisionTreeRegressor=dict(
        max_depth=[None, 10, 20, 30, 40, 50],
        min_samples_split=[2, 5, 10, 15, 20],
        min_samples_leaf=[1, 2, 4, 6, 8],
    ),
    SVR=dict(
        kernel=["linear", "rbf", "poly"],
        C=[0.1, 0.5, 1.0, 5.0, 10.0],
        epsilon=[0.1, 0.2, 0.5],
    ),
    KNeighborsRegressor=dict(
        n_neighbors=[3, 5, 7, 10, 15],
        weights=["uniform", "distance"],
        n_jobs=[-1],
    ),
    RandomForestRegressor=dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        n_jobs=[-1],
    ),
    PoissonRegressor=dict(
        alpha=[0.01, 0.1, 0.5, 1.0, 2.0],
        max_iter=[100, 200, 300],
    ),
)

In [46]:
# For every estimator in param_grid: run a 5-fold grid search on the training
# split (selected by negative RMSE), score the winning estimator on the test
# split, print the metrics, and record the result in best_models.
#
# best_models maps estimator name -> {"model", "params", "loss_value"}, where
# "loss_value" is the test-set RMSE.
best_models = {}

for model_name, params in param_grid.items():

    print(f"\n{model_name}:")
    model = get_model(model_name)
    if model is None:
        # get_model has already printed why it failed; skip this entry instead
        # of handing None to GridSearchCV.
        continue

    grid_search = GridSearchCV(model, params, cv=5, scoring="neg_root_mean_squared_error")
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    # Full parameter set of the winner (defaults included), not only the
    # searched keys.
    best_params = best_model.get_params()

    # Test-set metrics for the selected estimator.
    y_pred = best_model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Best Root Mean Squared Error: {rmse}')
    print(f'Best Mean Absolute Error: {mae}')
    print(f'Best R2 Error: {r2}')
    print(f'Best Parameters: {best_params}\n')

    best_models[model_name] = {
        "model" : best_model,
        "params" : best_params,
        # Was `mse`, a name never defined in this notebook (NameError on a
        # fresh kernel); the loss computed above is the RMSE.
        "loss_value" : rmse
    }


LinearRegression:
Best Root Mean Squared Error: 451781.58954742417
Best Mean Absolute Error: 310145.08905050496
Best R2 Error: 0.5095483851873996
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}


Lasso:
Best Root Mean Squared Error: 451658.7165812627
Best Mean Absolute Error: 310135.048144976
Best R2 Error: 0.5098151293444833
Best Parameters: {'alpha': 10.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 2500, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}


Ridge:
Best Root Mean Squared Error: 449564.541668025
Best Mean Absolute Error: 309266.94501138886
Best R2 Error: 0.5143502041441828
Best Parameters: {'alpha': 10.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001}


DecisionTreeRegressor:
Best Root Mean Squared Error: 339566.55877709703
Best Mean Absolute Error: 201478.97708125936


In [31]:
import os

import joblib

# Persist each tuned estimator from best_models as
# ../joblib-files/models/<version>/<estimator name>.pkl.
output_dir = f"../joblib-files/models/{version}"

# joblib.dump does not create missing folders, so make sure the versioned
# output folder exists before writing.
os.makedirs(output_dir, exist_ok=True)

# Iterate name/entry pairs directly; this also avoids rebinding `best_model`
# (the estimator variable used by the training cell) to a dictionary key.
for model_name, entry in best_models.items():
    joblib.dump(entry["model"], f"{output_dir}/{model_name}.pkl")