We perform cross-validation for different models for regression and save the results.

# Imports

In [4]:
# Standard imports

import pandas as pd
import numpy as np

In [5]:
# Sklearn imports

from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Load data

In [None]:
from src.utils import get_data

# Only elements 0 and 2 of get_data()'s return value are used in this notebook.
# NOTE(review): presumably the return is ordered (X_train, X_test, y_train, y_test) — confirm in src.utils.
data = get_data()
X_train = data[0]
y_train = data[2]

# Define models

In [10]:
# Each candidate is a (name, estimator, param_grid) triple.
# Every grid key is written as "<name>__<parameter>", i.e. prefixed with the triple's own name.
# NOTE(review): presumably grid_search_best uses the name as a pipeline step name — confirm in src.utils.

# Plain least squares: nothing to tune, hence the empty grid.
lin_reg = ("lin_reg", LinearRegression(), dict())

# Regularised linear models: 7 log-spaced penalties from 1e-3 to 1e3.
ridge = (
    "ridge",
    Ridge(),
    {"ridge__alpha": np.logspace(start=-3, stop=3, num=7)},
)
lasso = (
    "lasso",
    Lasso(),
    {"lasso__alpha": np.logspace(start=-3, stop=3, num=7)},
)

# k-nearest neighbours: tune the neighbourhood size only.
knn = (
    "knn",
    KNeighborsRegressor(),
    {"knn__n_neighbors": [1, 3, 5, 10, 20, 50]},
)

# Tree ensembles: same seed (314) and the same size/depth grid for both.
random_forest = (
    "random_forest",
    RandomForestRegressor(random_state=314),
    {
        "random_forest__n_estimators": [1, 10, 100, 500],
        "random_forest__max_depth": list(range(1, 6)),
    },
)
gb_reg = (
    "gb_reg",
    GradientBoostingRegressor(random_state=314),
    {
        "gb_reg__n_estimators": [1, 10, 100, 500],
        "gb_reg__max_depth": list(range(1, 6)),
    },
)

# Order here determines the column order of the results table built later.
models = [lin_reg, ridge, lasso, knn, random_forest, gb_reg]

# Apply CV

In [None]:
from src.utils import grid_search_best

# Maps model name -> whatever grid_search_best returns for that model.
results_dic = {}

# Run the grid search for every candidate model on the training data.
# NOTE(review): per the original note, grid_search_best applies standard scaling — confirm in src.utils.
for model in models:
    # Unpack the (name, estimator, param_grid) triple instead of indexing it positionally.
    name, estimator, param_grid = model
    results_dic[name] = grid_search_best(X_train, y_train, name, estimator, param_grid)

# One column per model, in the order the models were searched.
results = pd.DataFrame(results_dic)

In [13]:
# Display the CV summary table (one column per model).
results

Unnamed: 0,lin_reg,ridge,lasso,knn,random_forest,gb_reg
0,{},{'ridge__alpha': 10.0},{'lasso__alpha': 0.1},{'knn__n_neighbors': 10},"{'random_forest__max_depth': 3, 'random_forest...","{'gb_reg__max_depth': 1, 'gb_reg__n_estimators..."
1,0.579475,0.579531,0.579731,0.517805,0.569466,0.579587


We see that the best models, which perform essentially equivalently, are linear regression (with or without regularization) and gradient boosting.

Let us try these models with polynomial features.

In [None]:
# With polynomial features, degree up to 3.

# Only the models retained after the first round of cross-validation are re-run.
models_poly = [lin_reg, ridge, lasso, gb_reg]

# Maps model name -> whatever grid_search_best returns with polyfeat=True.
results_dic_poly = {}

# Same grid search as before, now with polynomial features switched on.
# NOTE(review): per the original note, grid_search_best applies standard scaling — confirm in src.utils.
for model in models_poly:
    # Unpack the (name, estimator, param_grid) triple instead of indexing it positionally.
    name, estimator, param_grid = model
    results_dic_poly[name] = grid_search_best(
        X_train, y_train, name, estimator, param_grid, polyfeat=True
    )

# One column per model, in the order the models were searched.
results_poly = pd.DataFrame(results_dic_poly)

In [16]:
# Display the CV summary table for the polynomial-feature runs (one column per model).
results_poly

Unnamed: 0,lin_reg,ridge,lasso,gb_reg
0,{'poly_feat__degree': 1},"{'poly_feat__degree': 1, 'ridge__alpha': 10.0}","{'lasso__alpha': 0.1, 'poly_feat__degree': 1}","{'gb_reg__max_depth': 1, 'gb_reg__n_estimators..."
1,0.579475,0.579531,0.579731,0.582297


There is a small improvement for gradient boosted trees, but nothing substantial. We see little difference between linear regression, linear regression with regularization, and gradient boosted trees.

In [None]:
# Save results

# Write each summary table to its own CSV file (paths are relative to the notebook's working directory).
for frame, path in (
    (results, "../results/regression/CV_results.csv"),
    (results_poly, "../results/regression/CV_poly_results.csv"),
):
    frame.to_csv(path)