In [2]:
import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.space import Categorical, Integer, Continuous

In [3]:
# Testing whether using data_consol.csv helps anything. If so, probably indicates an error in reading in or joining the separate CSVs before
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

data_consol = pd.read_csv(root + '//data/data_consol.csv')

In [4]:
X = data_consol.filter(regex="^[0-9]+$")
bact = data_consol['pcr_bact_log']

# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline to scale both Xs, and separately scale the y for custom scoring like RMSE.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(X.to_numpy(), bact.to_numpy(), train_size=0.8, random_state=0)

# Reshaping necessary for the y scaling step
bact_train_unscaled = bact_train_unscaled.reshape(-1,1)
bact_test_unscaled = bact_test_unscaled.reshape(-1,1)

bact_scaler = StandardScaler()
bact_train = bact_scaler.fit_transform(bact_train_unscaled)
bact_test = bact_scaler.transform(bact_test_unscaled)

# 10-fold CV; random state 0
cv_10_0 = KFold(n_splits=10, shuffle=True, random_state=0)

In [5]:
model = ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000)


Now let's set up the feature selector object

In [6]:
# Define the genetic algorithm feature selector
selector = GAFeatureSelectionCV(
    estimator=model,
    cv=10,  # Cross-validation folds
    scoring="neg_root_mean_squared_error",  # Fitness function (maximize accuracy)
    population_size=20,  # Number of individuals in the population
    max_features=300,
    generations=50,  # Number of generations
    n_jobs=-1,  # Use all available CPU cores
    verbose=False,  # Print progress
)

In [None]:
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("features", selector ),
        ("elastic_net",  model )
    ], 
    memory = root+'\\cache',
    verbose=True
)

REGULARIZATION = np.logspace(-5, 0, 8)
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_10_0, error_score='raise')
grid.fit(X_train, bact_train)

In [8]:
print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# Inverse-transforming the preds to get back to original scale.
# Used for comparison with R results
preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
print('Testing RMSE, unscaled:', round(root_mean_squared_error(preds_unscaled, bact_test_unscaled), 3))

Training RMSE: 0.559
Testing RMSE: 0.777
Testing RMSE, unscaled: 0.343


In [9]:
print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# Inverse-transforming the preds to get back to original scale.
# Used for comparison with R results
preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
print('Testing RMSE, unscaled:', round(root_mean_squared_error(preds_unscaled, bact_test_unscaled), 3))

Training RMSE: 0.559
Testing RMSE: 0.777
Testing RMSE, unscaled: 0.343


In [10]:
best_pipe = grid.best_estimator_
selector = best_pipe.named_steps['features']

# Check if the selector has the 'best_features_' attribute
if hasattr(selector, "best_features_"):
    # Get the mask of selected features (True for selected, False for not selected)
    selected_features_mask = selector.best_features_

    # Get the feature names (if available)
    feature_names = X_train.columns if hasattr(X_train, "columns") else [f"Feature_{i}" for i in range(X_train.shape[1])]
    selected_feature_names = [name for name, selected in zip(feature_names, selected_features_mask) if selected]

    print(f"Selected Features Count: {len(selected_feature_names)}")
    print("Selected Features:", selected_feature_names)
else:
    print("The attribute 'best_features_' is not available.")


Selected Features Count: 258
Selected Features: ['Feature_2', 'Feature_7', 'Feature_13', 'Feature_47', 'Feature_53', 'Feature_57', 'Feature_69', 'Feature_75', 'Feature_89', 'Feature_100', 'Feature_104', 'Feature_119', 'Feature_126', 'Feature_142', 'Feature_156', 'Feature_160', 'Feature_176', 'Feature_177', 'Feature_178', 'Feature_181', 'Feature_185', 'Feature_201', 'Feature_206', 'Feature_217', 'Feature_234', 'Feature_239', 'Feature_244', 'Feature_251', 'Feature_257', 'Feature_270', 'Feature_283', 'Feature_296', 'Feature_301', 'Feature_322', 'Feature_329', 'Feature_334', 'Feature_335', 'Feature_342', 'Feature_352', 'Feature_364', 'Feature_378', 'Feature_383', 'Feature_395', 'Feature_400', 'Feature_404', 'Feature_412', 'Feature_415', 'Feature_421', 'Feature_422', 'Feature_437', 'Feature_450', 'Feature_453', 'Feature_455', 'Feature_459', 'Feature_477', 'Feature_478', 'Feature_485', 'Feature_512', 'Feature_513', 'Feature_518', 'Feature_529', 'Feature_542', 'Feature_547', 'Feature_582', 'F

*Testing models' results:*

For elastic_net random_state=0, training RMSE is 0.587, and testing RMSE is 0.766.

After removing the scaler, training RMSE is 0.491, and testing RMSE is 0.629

No scaler; optimizing R2: training RMSE is 0.236, testing RMSE is 0.935

No scaler; 10-fold CV: training RMSE 0.491, testing RMSE 0.629

With scaler; 10-fold CV; unscaling preds: training RMSE 0.587, testing RMSE 0.338 (on par with R results)

*Final model's results:*

Training RMSE: 0.587

Testing RMSE: 0.766

Testing RMSE, unscaled: 0.338 (on par with R results)