In [1]:
import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.space import Categorical, Integer, Continuous

**Preprocessing**

In [2]:
# Testing whether using data_consol.csv helps anything. If so, probably indicates an error in reading in or joining the separate CSVs before
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

data_consol = pd.read_csv(root + '//data/data_consol.csv')

In [3]:
X = data_consol.filter(regex="^[0-9]+$")
bact = data_consol['pcr_bact_log']

# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline to scale both Xs, and separately scale the y for custom scoring like RMSE.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(X.to_numpy(), bact.to_numpy(), train_size=0.8, random_state=0)

# Reshaping necessary for the y scaling step
bact_train_unscaled = bact_train_unscaled.reshape(-1,1)
bact_test_unscaled = bact_test_unscaled.reshape(-1,1)

bact_scaler = StandardScaler()
bact_train = bact_scaler.fit_transform(bact_train_unscaled).reshape(-1,1)
bact_test = bact_scaler.transform(bact_test_unscaled).reshape(-1,1)

# 10-fold CV; random state 0
cv_5_0 = KFold(n_splits=5, shuffle=True, random_state=0)

**The major pipeline components**

In [4]:
# Define the genetic algorithm feature selector
elastic_net = ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000)

ga_selector = GAFeatureSelectionCV(
    estimator=elastic_net,
    cv=cv_5_0,  # Cross-validation folds
    scoring="neg_root_mean_squared_error",  # Fitness function (maximize accuracy)
    population_size=20,  # Number of individuals in the population
    generations=50,  # Number of generations
    n_jobs=-1,  # Use all available CPU cores
    verbose=True,  # Print progress
    max_features = 32
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        #("features", ga_selector) ,
        ("elastic_net", elastic_net)
    ], 
    memory = root+'\\cache',
    verbose=True
)

REGULARIZATION = np.logspace(-5, 0, 8)
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')

**Train the model(s)**

In [5]:
grid.fit(X_train, bact_train)

[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   7.3s


**Investigate results**

In [6]:
print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# Inverse-transforming the preds to get back to original scale.
# Used for comparison with R results
preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
print('Testing RMSE, unscaled:', round(root_mean_squared_error(preds_unscaled, bact_test_unscaled), 3))

Training RMSE: 0.587
Testing RMSE: 0.766
Testing RMSE, unscaled: 0.338


In [34]:
wvs = np.arange(350,2501)

coeffs = grid.best_estimator_['elastic_net'].coef_
print(coeffs)
print()

abs_coeffs = np.abs(coeffs)
print(abs_coeffs)
print()

print(np.argsort(abs_coeffs))
print()

top_4_idx = np.argpartition(coeffs, -4)[-4:]
print(wvs[top_4_idx])

[ 0.14315325 -0.07610088 -0.01352258 ... -0.07548139 -0.05154376
 -0.00956238]

[0.14315325 0.07610088 0.01352258 ... 0.07548139 0.05154376 0.00956238]

[ 555  522  526 ... 1536 1538 1537]

[ 477  478 1901 1900]


*Temp results tracker*

max_features=16 ->
Training RMSE: 0.949
Testing RMSE: 0.984
Testing RMSE, unscaled: 0.434
[ 426  712 1295 1446 1878 2073 2423]

max_features=None ->
Training RMSE: 0.546
Testing RMSE: 0.771
Testing RMSE, unscaled: 0.34
967 features chosen

In [None]:
# GA choices
# feats = best_pipe.named_steps['features'].best_features_
# print(wvs[feats])