In [1]:
import git
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.inspection import permutation_importance

from sklearn_genetic import GAFeatureSelectionCV
# from sklearn_genetic.space import Categorical, Integer, Continuous

from math import ceil

**Preprocessing**

In [2]:
# Testing whether using data_consol.csv helps anything. If so, probably indicates an error in reading in or joining the separate CSVs before
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

data_consol = pd.read_csv(root + '//data/data_consol.csv')

SEED = 0
rng = np.random.default_rng(SEED)

# Intended for reproducible GA steps
np.random.seed(SEED)
random.seed(SEED)

In [3]:
X = data_consol.filter(regex="^[0-9]+$")
bact = data_consol['pcr_bact_log']

# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline to scale both Xs, and separately scale the y for custom scoring like RMSE.
X_train, X_test, bact_train_unscaled, bact_test_unscaled = train_test_split(X.to_numpy(), bact.to_numpy(), train_size=0.8, random_state=0)

# Reshaping necessary for the y scaling step
bact_train_unscaled = bact_train_unscaled.reshape(-1,1)
bact_test_unscaled = bact_test_unscaled.reshape(-1,1)

bact_scaler = StandardScaler()
bact_train = bact_scaler.fit_transform(bact_train_unscaled).reshape(-1,1)
bact_test = bact_scaler.transform(bact_test_unscaled).reshape(-1,1)

# 5-fold CV; random state 0
cv_5_0 = KFold(n_splits=5, shuffle=True, random_state=0)

# Used for waveband selection
wvs = np.arange(350,2501)

In [4]:
# Since this is only with respect to X_train, not any of the target variables, this only has to be computed once. (It's relatively cheap to compute, but this also has the benefit of preserving the random choices.)
def cluster(X_train):
    """ Uses agglomerative clustering with a distance threshold of 0.999 on the normalized feature correlation coefficient matrix. Then, it randomly selects one waveband from each cluster.
    This should be used as a preprocessing step when doing permutation importance. (Clustering method) """
    corr = np.corrcoef(X.T) # X needs to be transposed because of corrcoef's implementation
    agg = AgglomerativeClustering(n_clusters=None, distance_threshold=0.999) # The distance threshold is somewhat arbitrary, but it's based on EDA and domain knowledge, and the results seem reasonable.
    clusters = agg.fit_predict(corr)
    # Now select a single "representative" waveband from each cluster
    cluster_choices = []
    for i in range(np.max(clusters)):
        wv_in_cluster = wvs[clusters==i]
        cluster_choices.append(rng.choice(wv_in_cluster))
    cluster_choices = np.sort(np.array(cluster_choices))
    return cluster_choices

In [5]:
cluster_choices = cluster(X_train)

**The major pipeline components**

In [6]:
elastic_net = ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000)

# Used for embedded feature importance (via coeffs) and wrapper feature importance (via perm importance)
pipe_elastic_net = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("elastic_net", elastic_net)
    ],
    memory = root+'\\cache',
    verbose=True
)

# Hyperparameters for elastic net tuning. When code is finalized, expand for more thorough search using more computational resources.
REGULARIZATION = np.logspace(-5, 0, 8)
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

# A basic 

**Feature selection functions**

In [7]:
def mi(X_train, y_train, n_features=64):
    """ Uses mutual information to calculate the n_features most related features in X_train to y_train. (Filter method) """
    y_train = y_train.ravel()
    mi = mutual_info_regression(X_train, y_train)
    top_n_idx = np.argpartition(mi, -n_features)[-n_features:]
    return wvs[top_n_idx]

In [8]:
def train_elastic_net(X_train, y_train):
    """ Builds and fits an elastic net model using all features. 
    Returns the fit estimator (a pipeline). Used within coeffs() and ga(). """
    grid = GridSearchCV(estimator=pipe_elastic_net, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')
    grid.fit(X_train, y_train)
    return grid.best_estimator_

In [9]:
def coeffs(estimator, n_features=64):
    """ Builds and fits an elastic net model using all features. Returns the n_features features with the highest absolute-valued coefficients. (Embedded method) """
    coeffs = estimator['elastic_net'].coef_
    abs_coeffs = np.abs(coeffs)
    top_n_idx = np.argpartition(abs_coeffs, -n_features)[-n_features:]
    return wvs[top_n_idx]

In [10]:
def ga(X_train, y_train, trained_estimator, n_features=64, wv_subset=None):
    """ Uses a genetic algorithm to find the wavebands that gives the lowest RMSE on an elastic net model. 
    The subset will be at most n_features large, but it may be less than n_features large. 
    wv_subset should be None when in the feature selection layer, but when in the consolidation layer, it should
    be the subset of possible wavelengths output by the concatenated feature selection methods, not the entire
    [350,2500] set. (GA method) """
    
    y_train = y_train.ravel()
    ga_selector = GAFeatureSelectionCV(
        estimator=trained_estimator,
        cv=cv_5_0,  # Cross-validation folds
        scoring="neg_root_mean_squared_error",  # Fitness function (maximize accuracy)
        population_size=n_features*2,  # Number of individuals in the population
        generations=20,  # Number of generations
        n_jobs=-1,  # Use all available CPU cores
        verbose=True,  # Print progress
        max_features=n_features,
        return_train_score=True,
        refit=False,
        crossover_probability=0.75,
        mutation_probability=0.2
    )
    pipe_ga = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("ga", ga_selector)
        ], 
        memory = root+'\\cache',
        verbose=True
    )
    
    pipe_ga.fit(X_train, y_train)
    feats = pipe_ga['ga'].best_features_ # A mask of the features selected from X_train

    # Should be the case in the feature selection layer
    if wv_subset is None:
        return wvs[feats]
    # Should be the case in the consensus layer
    else:
        return wv_subset[feats]

In [11]:
def perm_imp(X_train, y_train, n_features=64):
    """ Calculates permutation importance on a dataset. cluster_choices should be the result of calling cluster(), which should be done once at the start of execution. 
    This is done outside this function to preserve the random selection. Returns the set of n_features wavebands with the highest permutation importance on the training set. (Wrapper method) """
    # Use only the features selected by clustering
    cluster_idx = cluster_choices - 350
    X_train = X_train[:,cluster_idx]
    # Build and train another elastic net model, but only on the features left after clustering, to use for permutation importance.
    pipe = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("elastic_net", elastic_net)
        ], 
        memory = root+'\\cache',
        verbose=True
    )    
    grid = GridSearchCV(estimator=pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')
    grid.fit(X_train, y_train)
    perm_imp = permutation_importance(grid, X_train, y_train, scoring='neg_root_mean_squared_error', n_repeats=10, n_jobs=-1, random_state=0)
    pi_top_64_idx = np.argpartition(perm_imp.importances_mean, -64)[-64:]
    return cluster_choices[pi_top_64_idx]

**Consensus function**

In [12]:
def consensus(X_train, y_train, n_features_intermed=64, max_features_output=8):
    """ Takes the wavebands output by the feature selection functions and uses a (separate) genetic algorithm to find the wavebands that give the lowest RMSE on an elastic net model.
    The subset will be at most n_features large, but it may be less than n_features large.
    Returns the tuple: (wv_mi, wv_coeffs, wv_ga, wv_cluster, wv_perm_imp, wv_consensus), where each is a numpy array of wavebands that were selected by each method.  (Consensus method) """
    
    # TODO: implement caching check like in R to avoid repeated computations
    print('\tFEATURE SELECTION:')
    print('\tStarting mutual importance...', end=' ')
    wv_mi = mi(X_train, y_train, n_features=n_features_intermed)
    print('Done.')
    print('\tTraining the elastic net model...', end=' ')
    trained_pipe = train_elastic_net(X_train, y_train)
    wv_coeffs = coeffs(trained_pipe, n_features=n_features_intermed)
    print('Done.')
    print('\tStarting genetic algorithm...', end=' ')
    wv_ga = ga(X_train, y_train, trained_pipe, n_features=n_features_intermed)
    print('Done.')
    print('\tStarting permutation importance...', end=' ')
    wv_cluster = cluster_choices # Doesn't require a separate function call
    wv_perm_imp = perm_imp(X_train, y_train, n_features=n_features_intermed)
    print('Done.')

    # Compile the above results into one array, remove any duplicates, and sort.
    wv_intermed = np.append(wv_mi, wv_coeffs)
    wv_intermed = np.append(wv_intermed, wv_ga)
    wv_intermed = np.append(wv_intermed, wv_cluster)
    wv_intermed = np.append(wv_intermed, wv_perm_imp)
    wv_intermed = np.sort(np.unique(wv_intermed))

    print('wv_mi:', wv_mi)
    print('wv_coeffs:', wv_coeffs)
    print('wv_ga:', wv_ga)
    print('wv_cluster:', wv_cluster)
    print('wv_perm_imp:', wv_perm_imp)

    # Convert the above into indices for masking over the dataset.
    wv_intermed_idx = wv_intermed-350
    X_train = X_train[:,wv_intermed_idx]

    # Use another genetic algorithm to find the best wavebands out of the narrowed possibilities
    print('\tCONSENSUS:')
    print('\tStarting genetic algorithm...', end=' ')
    wv_consensus = ga(X_train, y_train, trained_pipe, n_features=max_features_output, wv_subset=wv_intermed)
    print('\tDone.')
    return (wv_mi, wv_coeffs, wv_ga, wv_cluster, wv_perm_imp, wv_consensus)

In [13]:
wv_mi, wv_coeffs, wv_ga, wv_cluster, wv_perm_imp, wv_consensus = consensus(X_train, bact_train)

	FEATURE SELECTION:
	Starting mutual importance... Done.
	Training the elastic net model... [Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   5.7s
Done.
	Starting genetic algorithm... In ga():
X_train.shape: (318, 2151)
y_train.shape: (318,)
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	128   	-53906.7	49846.7    	-0.892467  	-100000    
1  	243   	-22657  	41860.3    	-0.892467  	-100000    
2  	243   	-3125.9 	17399.1    	-0.875887  	-100000    
3  	243   	-1563.4 	12401.8    	-0.875403  	-100000    
4  	244   	-0.912162	0.0130469  	-0.875403  	-0.939178  
5  	246   	-0.904743	0.0120599  	-0.875403  	-0.953242  
6  	243   	-1563.38 	12401.8    	-0.871437  	-100000    
7  	240   	-0.890547	0.0105686  	-0.866715  	-0.923079  
8  	242   	-0.88305 	0.0105751  	-0.863732  	-0.923587  
9  	244   	-0.874639	0.00871097 	-0.8538    	-0.898806  
10 	245   	-782.112 	8804.16    	-0.850496  	-100000    
11 	243   	-0.86323 	0.00775632 	-0.844016  	-0.886279  
12 	24

  model = cd_fast.enet_coordinate_descent(


[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   0.1s
Done.
wv_mi: [1621 1435 1646 2002 1482 1643 2005  722 1483 2330 1441 2312 2006 1642
 2254 2257 1645 1622 1437 1442 1445 1443 2314 2258  727 2004  716 1639
 1436 1444 1644 1638 1641 2262 2007 1623 1637 1640 2313 1624 2008 1886
 1636 2259 2261 1635 1632 1634 2260  717 1625 1633  983 1626 1630 1631
 1885 1629  720 1628 1627  719  718  354]
wv_coeffs: [1948 2347  430  382 2350  394  396  364  472 1925  981 1947 1867 1871
  428  366  479 2349 1884 1902 2348 1939  971 1868 1870  476 1934 1916
 1938  429 1869 1946  473  365 1899 1917  980  477  395  973 1890  478
 1945 1940 1935 1937  979 1936 1901 1944  978 1900  975  977 1941 1943
  976 1885  974 1942 1889 1886 1888 1887]
wv_ga: [ 364  471  473  476  498  519  619  677  708  712  743  764  841  880
  970  973  976  978  980  986 1017 1076 1161 1295 1359 1369 1378 1408
 1416 1428 1434 1438 1440 1512 1523 1565 1627 1743 1818 1843 1852 1853
 1868 1869 1887 1890 1921 1941 19

In [19]:
print(wv_consensus)
wv_consensus_idx = wv_consensus - 350

[1150 1430 1442 1746 1884 1944 2044 2409]


**Train a model on each subset for validation**

In [26]:
validator = GridSearchCV(estimator=pipe_elastic_net, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_5_0, error_score='raise')
validator.fit(X_train[:,wv_consensus_idx], bact_train)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   0.0s


  model = cd_fast.enet_coordinate_descent(


In [27]:
print('Training RMSE:', validator.score(X_train[:,wv_consensus_idx], bact_train))
print('Testing RMSE:', validator.score(X_test[:,wv_consensus_idx], bact_test))

Training RMSE: -0.9044502239222266
Testing RMSE: -0.9602222547928215


In [28]:
print(validator.best_estimator_['elastic_net'].coef_)

[ 1.17899391  6.05527982 -2.32535073 -1.93627568 -2.91641005 -0.31421716
 -0.95956144  1.27540174]


**Investigate results**