In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

version = "v5"

X = np.load(f"../processed-data/{version}/X.npy")
y = np.load(f"../processed-data/{version}/y.npy")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error


import warnings
warnings.filterwarnings('ignore')

In [3]:
import importlib

CLASS_MODULES = {
    "LinearRegression": "sklearn.linear_model",
    "SVR": "sklearn.svm",
    "DecisionTreeRegressor": "sklearn.tree",
    "Lasso": "sklearn.linear_model",
    "Ridge": "sklearn.linear_model",
    "KNeighborsRegressor": "sklearn.neighbors",
    "RandomForestRegressor": "sklearn.ensemble",
    "PoissonRegressor": "sklearn.linear_model",
}

def get_model(model_name):
    try:
        module_name = CLASS_MODULES.get(model_name)
        module = importlib.import_module(module_name)
        clf_class = getattr(module, model_name)
        clf_instance = clf_class()

        return clf_instance
    except ImportError as e:
        print(f"Error importing module: {e}")
    except AttributeError as e:
        print(f"Error accessing class {model_name}: {e}")

In [4]:
param_grid = {
    "LinearRegression" : {
        "n_jobs": [-1, ],
    },
    "Lasso" : {
        "alpha" : [0.1, 0.5, 1.0, 5.0, 10.0],
        "max_iter" : [2500, ],
    },
    "Ridge" : {
        "alpha" : [0.1, 0.5, 1.0, 5.0, 10.0],
    },
    "DecisionTreeRegressor" : {
        "max_depth" : [None, 10, 20, 30, 40, 50],
        "min_samples_split" : [2, 5, 10, 15, 20],
        "min_samples_leaf" : [1, 2, 4, 6, 8],
    },
    "SVR" : {
        "kernel" : ["linear", "rbf", "poly"],
        "C" : [0.1, 0.5, 1.0, 5.0, 10.0],
        "epsilon" : [0.1, 0.2, 0.5],
    },
    "KNeighborsRegressor" : {
        "n_neighbors" : [3, 5, 7, 10, 15],
        "weights" : ["uniform", "distance"],
        "n_jobs": [-1, ],
    },
    "RandomForestRegressor" : {
        "n_estimators" : [100, 200, 300],
        "max_depth" : [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "n_jobs" : [-1, ],
    },
    "PoissonRegressor" : {
        "alpha" : [0.01, 0.1, 0.5, 1.0, 2.0],
        "max_iter" : [100, 200, 300],
    },
}

In [5]:
best_models = {}

for model_name, params in param_grid.items():

    print(f"\n{model_name}:")
    model = get_model(model_name)
    grid_search = GridSearchCV(model, params, cv=5, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = best_model.get_params()

    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Best Mean Squared Error: {mse}')
    print(f'Best Parameters: {best_params}\n\n')

    best_models[model_name] = {
        "model" : best_model,
        "params" : best_params,
        "loss_value" : mse
    }


LinearRegression:
Best Mean Squared Error: 204106604653.99725
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}



Lasso:
Best Mean Squared Error: 203995596263.83337
Best Parameters: {'alpha': 10.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 2500, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}



Ridge:
Best Mean Squared Error: 202147632833.12433
Best Parameters: {'alpha': 5.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001}



DecisionTreeRegressor:
Best Mean Squared Error: 115327822393.25232
Best Parameters: {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 6, 'min_samples_split': 20, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter'

In [6]:
from sklearn.metrics import r2_score

for best_model in best_models:
    model = best_models[best_model]["model"]
    pred = model.predict(X_test)
    print(f"{best_model} - {r2_score(y_test, pred)}")

LinearRegression - 0.5095483851873996
Lasso - 0.5098151293444833
Ridge - 0.5142556355703465
DecisionTreeRegressor - 0.7228765976413314
SVR - 0.029711135889648577
KNeighborsRegressor - 0.7663791870506564
RandomForestRegressor - 0.8006063259208326
PoissonRegressor - 0.6433361330993965


In [7]:
import joblib

for best_model in best_models:
    model = best_models[best_model]["model"]
    joblib.dump(model, f"../joblib-files/models/{version}/{best_model}.pkl")