Reading in the data

In [1]:
from pathlib import Path

import git
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv # Needed for HalvingGridSearchCV, which is experimental
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
# Add more imports in this block later. There will need to be several "from sklearn.whatever import something" lines

In [2]:
# Locate the repository root so the data paths resolve no matter which sub-directory the notebook is run from.
repo = git.Repo('.', search_parent_directories = True)
root = repo.working_tree_dir

# Build paths with pathlib instead of hard-coded '\\' separators, so the notebook also runs on macOS/Linux.
data_dir = Path(root) / 'data'

# The sample id and the log-transformed gene expression values.
half_data_1 = pd.read_csv(data_dir / 'RKNGHStress.csv')
# Keep only the sample id column and the log-transformed columns, then give them short names.
half_data_1 = half_data_1.loc[:, half_data_1.columns.str.startswith(('Sample', 'Log'))]
half_data_1 = half_data_1.rename(columns = {'Sample' : 'sample', 'Log16S' : 'bact', 'Logcbblr' : 'cbblr', 'Log18S' : 'fungi', 'Logphoa' : 'phoa', 'Logurec' : 'urec'})

# The hyperspectral measurements for each sample
half_data_2 = pd.read_csv(data_dir / 'RKNGHStressPCAPSR.csv')
# The sample id arrives in an unnamed first column; name it so it can serve as the join key.
half_data_2 = half_data_2.rename(columns = {'Unnamed: 0' : 'sample'})

# Left join (DataFrame.join's default): every row of half_data_1 is kept and matched on the sample id.
data = half_data_1.join(half_data_2.set_index('sample'), on = 'sample')

TEMP: testing manual construction of models with specific hyperparameters

In [3]:
# Predictors: every column of `data` other than the sample id and the five gene-expression targets.
X = data.drop(columns = ['sample', 'bact', 'cbblr', 'fungi', 'phoa', 'urec'])
# NOTE: when doing phoa, there are a couple of samples (3 and 30) that have no data recorded, so we'll need to remove NAs there. But those observations still have data for the other genes.
# Target: kept as a one-column DataFrame (not a Series) so that it stays 2-D.
bact = data.loc[:, ['bact']]

In [4]:
# Note: do NOT scale X and y before splitting, since that is a data leak. Instead, use the pipeline to scale both Xs and the y training, and manually scale the y testing for custom scoring like RMSE.
# Split BEFORE any scaling to avoid a data leak. The target scaler below is fit on the training
# targets only and then applied to both the training and the testing targets.
X_train, X_test, bact_train, bact_test = train_test_split(
    X.to_numpy(),
    bact.to_numpy(),
    train_size=0.8,
    random_state=0,
)

print('Before scaling:')
print('bact_train:', pd.DataFrame(bact_train).describe(), end='\n\n')
print('bact_test:', pd.DataFrame(bact_test).describe(), end='\n\n\n')

# Fit on the training targets only; the test targets are transformed with the training mean/std.
bact_scaler = StandardScaler().fit(bact_train)
bact_train = bact_scaler.transform(bact_train)
bact_test = bact_scaler.transform(bact_test)

print('After scaling:')
print('bact_train:', pd.DataFrame(bact_train).describe(), end='\n\n')
print('bact_test:', pd.DataFrame(bact_test).describe())

# Three shuffled 5-fold splitters with fixed seeds (0, 1, 2), so the grid search can be repeated
# reproducibly on different fold assignments as a robustness check.
cv_0, cv_1, cv_2 = (
    KFold(n_splits=5, shuffle=True, random_state=seed) for seed in (0, 1, 2)
)

Before scaling:
bact_train:                 0
count  318.000000
mean     9.766514
std      0.441960
min      8.939812
25%      9.358483
50%      9.750377
75%     10.183323
max     10.476234

bact_test:                0
count  80.000000
mean    9.870154
std     0.455018
min     9.151460
25%     9.378870
50%    10.029126
75%    10.225967
max    10.476234


After scaling:
bact_train:                   0
count  3.180000e+02
mean  -4.203486e-16
std    1.001576e+00
min   -1.873485e+00
25%   -9.246853e-01
50%   -3.656805e-02
75%    9.445811e-01
max    1.608379e+00

bact_test:                0
count  80.000000
mean    0.234872
std     1.031170
min    -1.393844
25%    -0.878484
50%     0.595137
75%     1.041220
max     1.608379


In [5]:
# Two-step pipeline: standardize the predictors, then fit an elastic net.
# NOTE: fit_intercept=False means the model has no offset term, so it relies on the target being
# centered. That holds here because bact_train was standardized earlier; with an unscaled target
# (training mean about 9.77) the predictions cannot capture that offset.
test_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("elastic_net", ElasticNet(fit_intercept=False, warm_start=True, random_state=0, selection='random', max_iter=4000))
    ],
    # Cache directory for fitted transformers. Built with pathlib (and passed as a str) rather than a
    # hard-coded '\\' separator, so the notebook also runs on macOS/Linux.
    memory = str(Path(root) / 'cache'),
    verbose=True
)

# Hyperparameter grid: 8 log-spaced penalty strengths x 8 L1/L2 mixing ratios = 64 candidates.
# numpy arrays are accepted directly as grid values, so no list() wrapper is needed.
REGULARIZATION = np.logspace(-5, 0, 8)
# The lower bound is 0.001 rather than 0 to avoid the l1_ratio=0 endpoint.
MIXTURE = np.linspace(0.001, 1, 8)
PARAM_GRID = [
    {
        "elastic_net__alpha": REGULARIZATION,
        "elastic_net__l1_ratio": MIXTURE
    }
]

# Exhaustive search scored by negated RMSE (higher is better) on the cv_0 folds;
# error_score='raise' makes any failed fit stop the search instead of being silently scored.
grid = GridSearchCV(estimator=test_pipe, param_grid=PARAM_GRID, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv_0, verbose=5, error_score='raise')
grid.fit(X_train, bact_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   6.1s


In [6]:
# grid.score() uses the search's scoring ('neg_root_mean_squared_error'), i.e. the NEGATED RMSE,
# so abs() converts it back to a plain RMSE (in standardized-target units).
print('Training RMSE:', round(abs(grid.score(X_train, bact_train)), 3))
print('Testing RMSE:', round(abs(grid.score(X_test, bact_test)), 3))
print()

# bact_test is an (n, 1) column while the predictions are 1-D. Flatten both so the two arrays
# print in the same compact layout and can be compared position by position.
print('Model preds:', np.round(np.ravel(grid.predict(X_test)), 3))
print('True vals:', np.round(np.ravel(bact_test), 3))
print()

# Coefficients of the refit best model, followed by only those the penalty did not shrink to exactly zero.
coeffs = grid.best_estimator_.named_steps['elastic_net'].coef_
print(np.round(coeffs, 3))
nonzeros = coeffs[coeffs != 0.0]
print(np.round(nonzeros, 3))

Training RMSE: 0.587
Testing RMSE: 0.766

Model preds: [ 0.18   0.728  0.448  1.089  0.684  1.23  -0.421  0.562  0.798 -1.207
 -0.609 -0.231  0.059  0.991 -0.091 -0.665 -0.587 -0.531  0.    -1.068
  0.115  0.108 -0.48   0.612 -0.681 -0.252 -0.656 -0.665 -0.673  0.463
 -0.808 -0.51  -0.013 -0.919 -0.565  0.581  0.774 -0.499 -1.143  0.425
  0.357  0.624 -0.222 -0.428 -0.803  0.478  0.212 -0.198  0.597  0.291
  0.321  0.354 -0.644  1.153  0.619  1.197  0.77  -0.398 -0.559  0.82
  0.072 -0.281  0.259  1.473  0.547  0.248  0.824  0.202  0.152  0.075
 -0.368  0.968  0.937 -0.241  1.013 -1.025 -0.754  0.415  0.123  0.664]
True vals: [[ 0.586]
 [ 1.411]
 [ 1.524]
 [ 0.604]
 [ 0.54 ]
 [ 0.978]
 [-1.147]
 [ 0.898]
 [ 1.016]
 [-0.925]
 [-1.113]
 [-0.761]
 [ 0.963]
 [ 1.286]
 [-1.017]
 [-1.147]
 [ 0.322]
 [-0.2  ]
 [ 1.57 ]
 [-1.183]
 [-1.361]
 [ 1.583]
 [ 1.583]
 [ 0.604]
 [-1.041]
 [-0.345]
 [-0.782]
 [-0.2  ]
 [-0.925]
 [ 1.411]
 [-0.871]
 [-0.988]
 [ 1.588]
 [-1.017]
 [-0.9  ]
 [ 1.583]
 [ 0.7

Changing the elastic net hyperparameter from positive=False to True (default) solved the all-zero coefficient problem. I also tweaked MIXTURE to avoid the endpoint of 0, and max_iter=4000 (default was 1000), since the previous options led to nonconvergence because of numerical/implementation inefficiencies behind the scenes. 

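But there's still a problem with unexpectedly poor RMSE results. When normalizing y, the RMSE is about 0.7; when not normalizing, it is about 9.8. (Note that 9.8 is close to the mean of the unscaled bact values, about 9.77. This is what a model fit with fit_intercept=False on an uncentered target would produce, since it cannot represent that offset.)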

In [7]:
# Summary statistics of the (already scaled) training and testing targets, separated by a blank line.
print('bact_train:', pd.DataFrame(bact_train).describe(), end='\n\n')
print('bact_test:', pd.DataFrame(bact_test).describe())

bact_train:                   0
count  3.180000e+02
mean  -4.203486e-16
std    1.001576e+00
min   -1.873485e+00
25%   -9.246853e-01
50%   -3.656805e-02
75%    9.445811e-01
max    1.608379e+00

bact_test:                0
count  80.000000
mean    0.234872
std     1.031170
min    -1.393844
25%    -0.878484
50%     0.595137
75%     1.041220
max     1.608379


In [8]:
# Re-run the same pipeline and parameter grid on two other fold assignments (cv_1, cv_2).
# Everything except the splitter is shared, so it is collected once here.
search_options = dict(
    estimator=test_pipe,
    param_grid=PARAM_GRID,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=5,
    error_score='raise',
)

# fit() returns the fitted search object itself, so construction and fitting can be chained.
grid1 = GridSearchCV(cv=cv_1, **search_options).fit(X_train, bact_train)

grid2 = GridSearchCV(cv=cv_2, **search_options).fit(X_train, bact_train)
grid2

Fitting 5 folds for each of 64 candidates, totalling 320 fits


  model = cd_fast.enet_coordinate_descent(


[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   1.2s
Fitting 5 folds for each of 64 candidates, totalling 320 fits
[Pipeline] ....... (step 2 of 2) Processing elastic_net, total=   4.6s


In [9]:
# Held-out test RMSE of the best model found under each CV split (score() is negated RMSE, hence abs()).
for split_id, fitted_search in enumerate((grid, grid1, grid2)):
    print(f'Split {split_id} RMSE:', round(abs(fitted_search.score(X_test, bact_test)), 3))

Split 0 RMSE: 0.766
Split 1 RMSE: 0.78
Split 2 RMSE: 0.766


After testing the same pipeline/grid search CV with the same params, but with different CV splits, the RMSEs were all about the same (0.735, 0.758, and 0.735). So it's unlikely that the CV split itself is responsible for an unlucky high RMSE.

UPDATE: After correctly scaling bact_test according to bact_train's distribution, instead of bact_test's own, the RMSEs slightly bumped up to 0.766, 0.78, and 0.766. This slight "worsening" was expected since the previous results were slightly cheating.

In [10]:
# Show the candidate alpha values and l1_ratio values, separated by a blank line.
print(REGULARIZATION, MIXTURE, sep='\n\n')

[1.00000000e-05 5.17947468e-05 2.68269580e-04 1.38949549e-03
 7.19685673e-03 3.72759372e-02 1.93069773e-01 1.00000000e+00]

[0.001      0.14371429 0.28642857 0.42914286 0.57185714 0.71457143
 0.85728571 1.        ]


TODO: Check to see if Pipeline is doing the process seen here (https://datascience.stackexchange.com/questions/81276/svr-rmse-is-much-worse-after-normalizing-the-data) behind the scenes. Basically, don't normalize X_test, for example, WRT itself either; normalize WRT X_train. Double check this though.

TODO: Check how transforming fits into the Pipeline process.