In [1]:
import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.space import Categorical, Integer, Continuous

**Preprocessing**

In [2]:
# Load the consolidated dataset. Purpose: test whether using data_consol.csv changes the results.
# If it does, that probably indicates an error in how the separate CSVs were previously read in or joined.
repo = git.Repo('.', search_parent_directories=True)  # locate the enclosing git repo, searching parent directories
root = repo.working_tree_dir  # working-tree directory of that repo, used as the base for data paths

# Build the path relative to the repo root with a single separator (no doubled slash).
data_consol = pd.read_csv(root + '/data/data_consol.csv')

In [3]:
# Predictors: every column whose name consists solely of digits.
X = data_consol.filter(regex="^[0-9]+$")
# Response: the 'pcr_bact_log' column.
bact = data_consol['pcr_bact_log']

# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline
# to scale both Xs, and separately scale the y for custom scoring like RMSE.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(
    X.to_numpy(), bact.to_numpy(), train_size=0.8, random_state=0
)

# Reshape each 1-D response vector into a single column; necessary for the y scaling step.
bact_train_unscaled = bact_train_unscaled.reshape(-1, 1)
bact_test_unscaled = bact_test_unscaled.reshape(-1, 1)

# Fit the response scaler on the training split only, then apply the same fitted scaler to the
# test split. The inputs are already (n, 1), so the outputs need no further reshaping.
bact_scaler = StandardScaler()
bact_train = bact_scaler.fit_transform(bact_train_unscaled)
bact_test = bact_scaler.transform(bact_test_unscaled)

# 5-fold CV; shuffled; random state 0
cv_5_0 = KFold(n_splits=5, shuffle=True, random_state=0)

**The major pipeline components**

In [4]:
# # Define the genetic algorithm feature selector
# elastic_net = ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000)

# ga_selector = GAFeatureSelectionCV(
#     estimator=elastic_net,
#     cv=cv_5_0,  # Cross-validation folds
#     scoring="neg_root_mean_squared_error",  # Fitness function (maximize negative RMSE, i.e. minimize RMSE)
#     population_size=20,  # Number of individuals in the population
#     generations=50,  # Number of generations
#     n_jobs=-1,  # Use all available CPU cores
#     verbose=True,  # Print progress
#     max_features = 32
# )

# pipe = Pipeline(
#     [
#         ("scaler", StandardScaler()),
#         #("features", ga_selector) ,
#         ("elastic_net", elastic_net)
#     ], 
#     memory = root+'\\cache',
#     verbose=True
# )

# REGULARIZATION = np.logspace(-5, 0, 8)
# MIXTURE = np.linspace(0.001, 1, 8)
# PARAM_GRID = [
#     {
#         "elastic_net__alpha": REGULARIZATION,
#         "elastic_net__l1_ratio": MIXTURE
#     }
# ]

# grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')

**Train the model(s)**

In [5]:
# grid.fit(X_train, bact_train)

**Investigate results**

In [6]:
# print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
# print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))

# # Inverse-transforming the preds to get back to original scale.
# # Used for comparison with R results
# preds_unscaled = bact_scaler.inverse_transform(grid.predict(X_test).reshape(-1,1))
# print('Testing RMSE, unscaled:', round(root_mean_squared_error(preds_unscaled, bact_test_unscaled), 3))

In [7]:
# Consecutive integer labels 350..2500 inclusive, used to translate positional feature indices
# back into labels (presumably the wavelengths named by X's digit-only columns — TODO confirm
# that the column order and range match).
wvs = np.arange(350, 2500 + 1)

In [33]:
# Filter method: score each feature by its estimated mutual information with the scaled response.
# The estimator is randomized, so random_state is pinned (0, as elsewhere in this notebook) to make
# the selected features reproducible across runs.
mi = mutual_info_regression(X_train, bact_train.ravel(), random_state=0)
# Indices of the 64 highest-scoring features; argpartition does not order them among themselves.
mi_top_64_idx = np.argpartition(mi, -64)[-64:]
print(wvs[mi_top_64_idx])

[2256 2003 2002 1438 1435 1482 2005 2312 2006  727 1441 1621 1483 1445
 2254 1646 2314 1643 1642 1443 1444 1442 2257 1622 2262 1645 1436 1437
 2004 2007 1638 1639 1644 2258 1641 2313 1623 1637 1640 2008  717 2261
 1624 1886 1636  983 1635 2259 1632 1634 2260 1625 1633 1626  720 1631
 1629 1630 1885 1628 1627  719  718  354]


In [8]:
# Coeffs (embedded method) choices

# coeffs = grid.best_estimator_['elastic_net'].coef_
# print(coeffs)
# print()

# abs_coeffs = np.abs(coeffs)
# print(abs_coeffs)
# print()

# print(np.argsort(abs_coeffs))
# print()

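# NOTE(review): this partitions the signed `coeffs`, so it picks the 64 largest positive coefficients;
# if the intent is the 64 largest magnitudes, use `abs_coeffs` here instead — confirm.
# top_64_idx = np.argpartition(coeffs, -64)[-64:]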
# print(wvs[top_64_idx])

In [9]:
# GA choices
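# NOTE(review): `best_pipe` is not defined in the visible notebook (presumably `grid.best_estimator_`),
# and the 'features' step is currently commented out of the pipeline — confirm before re-enabling.
# feats = best_pipe.named_steps['features'].best_features_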
# print(wvs[feats])

In [10]:
# # Before using permutation importance, check how strongly the features are correlated with each other and group the correlated ones into clusters

# corr = np.corrcoef(X.T)
# print(corr)
# print()
# from sklearn.cluster import AgglomerativeClustering
# agg = AgglomerativeClustering(n_clusters=16)
# clusters = agg.fit_predict(corr)
# print(clusters)
# print()
# print(np.unique_counts(clusters))

[[1.         0.99113472 0.96988146 ... 0.29156386 0.29157205 0.29117255]
 [0.99113472 1.         0.99171728 ... 0.33020638 0.33037111 0.33011075]
 [0.96988146 0.99171728 1.         ... 0.36985384 0.37027856 0.3702527 ]
 ...
 [0.29156386 0.33020638 0.36985384 ... 1.         0.99951765 0.99833201]
 [0.29157205 0.33037111 0.37027856 ... 0.99951765 1.         0.99962948]
 [0.29117255 0.33011075 0.3702527  ... 0.99833201 0.99962948 1.        ]]

[14 14 14 ... 13 13 13]

UniqueCountsResult(values=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), counts=array([201, 231, 317, 156,  43,  82, 101,   7,  58,  17, 413, 164, 170,
        93,   4,  94]))
