In [1]:
import os

import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [2]:
# Testing whether using data_consol.csv helps anything. If it does, that probably
# indicates an error in how the separate CSVs were read in or joined previously.

# Find the enclosing git repository by searching upward from the current directory,
# and use its working-tree directory as the base for all project paths.
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

# Build the path with os.path.join instead of hard-coded '\\' separators so the
# notebook is not tied to Windows.
data_consol = pd.read_csv(os.path.join(root, 'data', 'data_consol.csv'))

In [3]:
# Predictors are the columns whose names consist only of digits; the response is
# the 'pcr_bact_log' column.
X = data_consol.filter(regex="^[0-9]+$")
bact = data_consol['pcr_bact_log']

# Note: X and y must NOT be scaled before splitting -- that would leak test-set
# information. X is scaled inside the pipeline; y is scaled separately below so
# that custom scores such as RMSE can be computed on either scale.
# The response is reshaped to a single column up front because the y-scaling
# step needs 2-D input.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(
    X.to_numpy(),
    bact.to_numpy().reshape(-1, 1),
    train_size=0.8,
    random_state=0,
)

# Fit the response scaler on the training split only, then apply it to both splits.
bact_scaler = StandardScaler().fit(bact_train_unscaled)
bact_train = bact_scaler.transform(bact_train_unscaled)
bact_test = bact_scaler.transform(bact_test_unscaled)

# 10-fold CV; random state 0
cv_10_0 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

In [4]:
# Two-step pipeline: standardize the predictors, then fit an elastic net.
# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
# NOTE(review): fit_intercept=False presumably relies on X and y both being
# standardized -- confirm.
# NOTE(review): GridSearchCV clones the estimator for every fit, so
# warm_start=True likely has no effect here -- confirm before relying on it.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("elastic_net", ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000))
    ],
    # Cache directory for fitted transformers. Built with os.path.join (which
    # returns a str) instead of a hard-coded '\\' separator so the notebook is
    # not tied to Windows.
    memory = os.path.join(root, 'cache'),
    verbose=True
)

# Hyperparameter grid: 8 penalty strengths (log-spaced, 1e-5 to 1) crossed with
# 8 L1/L2 mixing ratios (linearly spaced, 0.001 to 1).
REGULARIZATION = np.logspace(-5, 0, 8)
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

# Exhaustive search scored by (negated) RMSE over the 10-fold splitter defined
# earlier; error_score='raise' surfaces any failed fit instead of hiding it.
grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_10_0, error_score='raise')
grid.fit(X_train, bact_train)

[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=  10.0s


In [5]:
# grid.score uses the search's scoring ('neg_root_mean_squared_error'), so the
# value is negated RMSE; abs() turns it back into a positive RMSE.
# bact_train / bact_test are the standardized targets, so these two numbers are
# in standardized units.
print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# Inverse-transforming the preds to get back to the original scale.
# Used for comparison with R results.
preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
# Arguments follow the documented (y_true, y_pred) order.
print('Testing RMSE, unscaled:', round(root_mean_squared_error(bact_test_unscaled, preds_unscaled), 3))

Training RMSE: 0.587
Testing RMSE: 0.766
Testing RMSE, unscaled: 0.338


*Testing models' results:*

For elastic_net random_state=0, training RMSE is 0.587, and testing RMSE is 0.766.

After removing the scaler, training RMSE is 0.491, and testing RMSE is 0.629

No scaler; optimizing R2: training RMSE is 0.236, testing RMSE is 0.935

No scaler; 10-fold CV: training RMSE 0.491, testing RMSE 0.629

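With scaler; 10-fold CV; unscaling preds: training RMSE 0.587 (standardized units), testing RMSE 0.338 (original units; on par with R results). Note that these two values are on different scales and are not directly comparable.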

*Final model's results:*

Training RMSE (standardized units): 0.587

Testing RMSE (standardized units): 0.766

Testing RMSE, unscaled (original units): 0.338 (on par with R results)