In [34]:
import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.inspection import permutation_importance
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.space import Categorical, Integer, Continuous

**Preprocessing**

In [2]:
# Testing whether using data_consol.csv helps anything. If so, probably indicates an error in reading in or joining the separate CSVs before
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

data_consol = pd.read_csv(root + '//data/data_consol.csv')

In [3]:
X = data_consol.filter(regex="^[0-9]+$")
bact = data_consol['pcr_bact_log']

# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline to scale both Xs, and separately scale the y for custom scoring like RMSE.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(X.to_numpy(), bact.to_numpy(), train_size=0.8, random_state=0)

# Reshaping necessary for the y scaling step
bact_train_unscaled = bact_train_unscaled.reshape(-1,1)
bact_test_unscaled = bact_test_unscaled.reshape(-1,1)

bact_scaler = StandardScaler()
bact_train = bact_scaler.fit_transform(bact_train_unscaled).reshape(-1,1)
bact_test = bact_scaler.transform(bact_test_unscaled).reshape(-1,1)

# 10-fold CV; random state 0
cv_5_0 = KFold(n_splits=5, shuffle=True, random_state=0)

rng = np.random.default_rng(0)

**The major pipeline components**

In [4]:
# Define the genetic algorithm feature selector
elastic_net = ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000)

ga_selector = GAFeatureSelectionCV(
    estimator=elastic_net,
    cv=cv_5_0,  # Cross-validation folds
    scoring="neg_root_mean_squared_error",  # Fitness function (maximize accuracy)
    population_size=20,  # Number of individuals in the population
    generations=50,  # Number of generations
    n_jobs=-1,  # Use all available CPU cores
    verbose=True,  # Print progress
    max_features = 32
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        #("features", ga_selector) ,
        ("elastic_net", elastic_net)
    ], 
    memory = root+'\\cache',
    verbose=True
)

REGULARIZATION = np.logspace(-5, 0, 8)
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')

**Train the model(s)**

In [5]:
# grid.fit(X_train, bact_train)

**Investigate results**

In [6]:
# print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
# print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# # Inverse-transforming the preds to get back to original scale.
# # Used for comparison with R results
# preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
# print('Testing RMSE, unscaled:', round(root_mean_squared_error(preds_unscaled, bact_test_unscaled), 3))

In [7]:
wvs = np.arange(350,2501)

In [8]:
# Filter: using mutual information criterion
# mi = mutual_info_regression(X_train, bact_train.ravel())
# mi_top_64_idx = np.argpartition(mi, -64)[-64:]
# print(wvs[mi_top_64_idx])

In [9]:
# Coeffs (embedded method) choices

# coeffs = grid.best_estimator_['elastic_net'].coef_
# print(coeffs)
# print()

# abs_coeffs = np.abs(coeffs)
# print(abs_coeffs)
# print()

# print(np.argsort(abs_coeffs))
# print()

# top_64_idx = np.argpartition(coeffs, -64)[-64:]
# print(wvs[top_64_idx])

In [10]:
# GA choices
# feats = best_pipe.named_steps['features'].best_features_
# print(wvs[feats])

In [22]:
# Before using permutation importance, need to cluster based on correlation to reduce multicollinearity.
corr = np.corrcoef(X.T) # X needs to be transposed because of corrcoef's implementation
agg = AgglomerativeClustering(n_clusters=None, distance_threshold=0.999) # The distance threshold is somewhat arbitrary, but it's based on EDA and domain knowledge, and the results seem reasonable.
clusters = agg.fit_predict(corr)

# Now select a single "representative" waveband from each cluster
cluster_choices = []
for i in range(np.max(clusters)):
    wv_in_cluster = wvs[clusters==i]
    cluster_choices.append(rng.choice(wv_in_cluster))
cluster_choices = np.sort(np.array(cluster_choices))
cluster_choices_idx = cluster_choices-350

In [58]:
# Build and train another elastic net model, but only on the features left after clustering, to use for permutation importance.
pipe_perm_imp = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("elastic_net", elastic_net)
    ], 
    memory = root+'\\cache',
    verbose=True
)

grid_perm_imp = GridSearchCV(estimator=pipe_perm_imp, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')
grid_perm_imp.fit(X_train[:,cluster_choices_idx], bact_train)
perm_imp = permutation_importance(grid_perm_imp, X_train[:,cluster_choices_idx], bact_train, scoring='neg_root_mean_squared_error', n_repeats=10, n_jobs=-1, random_state=0)

pi_top_64_idx = np.argpartition(perm_imp.importances_mean, -64)[-64:]
print(cluster_choices[pi_top_64_idx])

  model = cd_fast.enet_coordinate_descent(


[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   0.1s
[1899 2285 1978 1572  717 1599 2062 2068  365 2138 1407  402 1888  991
  450 1974  715 1882 1990 1400  529 2128 2119 2335  563 2140 1865 1887
 2333 1226 1900 2279 1857 1959  782  921 1716 1964 1832 1396 2315  396
 2092 1798  729 1651 2242 1372  961 1634 2020  683 1850  728 2255 1870
  744 1415 1885  541 1847 1667 1946 1942]
