# Train final model: Lasso (Elastic-Net with an L1 penalty only)

In [11]:
import numpy as np
import pandas as pd
import joblib  # was misspelled `jobliblib`; the save cell calls `joblib.dump`

# sklearn imports
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold


# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability import modelevaluation

In [2]:
# Load the train and test splits; each is unpacked as a (predictors, response) pair.
(train_x, train_y), (test_x, test_y) = get_data('../data/191004-TrainAndTestSets/')

# Report how many rows each split has and how many predictor columns there are.
split_summary = "{} points for training and {} for testing with {} features"
print(split_summary.format(train_x.shape[0], test_x.shape[0], test_x.shape[1]))

# groups for grouped cross validation: the index labels of the training rows
groups = train_x.index.values

67775 points for training and 7576 for testing with 6 features


In [3]:
# DATA PRE-PROCESSING
# Predictor pipeline: the steps run in the order listed below.
preprocessing_steps = [
    # project-defined preprocessing, built from the training predictors
    # (see the code for general_preprocesing_pipeline)
    ('general', general_preprocesing_pipeline(train_x)),
    # degree-2 polynomial expansion of the preprocessed predictors
    ('polyfeaturs', PolynomialFeatures(degree=2)),
    # drop columns whose variance is exactly zero
    ('zerovar', VarianceThreshold(threshold=0.0)),
    # scale again: after the polynomial expansion the features may no longer
    # all be on a common scale
    ('scaling', StandardScaler()),
]

preprocessing = Pipeline(steps=preprocessing_steps)

In [4]:
# Fit the preprocessing pipeline on the training predictors and transform them
# in one pass. `fit_transform` leaves `preprocessing` fitted, exactly as a
# separate `fit` call would, but avoids pushing the training data through every
# step a second time just to obtain the transformed matrix.
train_x_transformed = preprocessing.fit_transform(train_x)

In [5]:
# Grid search over the Lasso penalty strength (alpha).
# TODO (original note: "increase 5 to 20"): it is unclear which setting this
# refers to -- this cell uses 50 alpha candidates and n_splits=8. Confirm or drop.

# 50 log-spaced candidates, from 1e-10 up to 10**-0.00001 (just below 1)
alphas = np.logspace(start=-10, stop=-0.00001, num=50)
lasso_grid = [dict(alpha=alphas)]

lasso = Lasso()

# Project helper; presumably runs a grouped cross-validated grid search using
# `groups` -- see modelevaluation.gridsearch to confirm.
lasso_search = modelevaluation.gridsearch(
    lasso, lasso_grid, train_x_transformed, train_y, groups, n_splits=8
)

Fitting 8 folds for each of 50 candidates, totalling 400 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed: 16.9min
[Parallel(n_jobs=32)]: Done  21 tasks      | elapsed: 18.4min
[Parallel(n_jobs=32)]: Done  34 tasks      | elapsed: 32.6min
[Parallel(n_jobs=32)]: Done  49 tasks      | elapsed: 35.6min
[Parallel(n_jobs=32)]: Done  64 tasks      | elapsed: 41.7min
[Parallel(n_jobs=32)]: Done  81 tasks      | elapsed: 54.6min
[Parallel(n_jobs=32)]: Done  98 tasks      | elapsed: 65.5min
[Parallel(n_jobs=32)]: Done 117 tasks      | elapsed: 74.7min
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed: 89.7min
[Parallel(n_jobs=32)]: Done 157 tasks      | elapsed: 98.2min
[Parallel(n_jobs=32)]: Done 178 tasks      | elapsed: 113.2min
[Parallel(n_jobs=32)]: Done 201 tasks      | elapsed: 125.7min
[Parallel(n_jobs=32)]: Done 224 tasks      | elapsed: 134.9min
[Parallel(n_jobs=32)]: Done 249 tasks      | elapsed: 147.9min
[Parallel(n_jobs=32)]: Done 274 tasks      | elapse

Best Score R2 =  0.1851893866926994
Best Parameters:  {'alpha': 0.005689764427226137}


In [14]:
# SAVE THE MODEL
# Persist the best estimator found by the grid search.
# NOTE(review): presumably the 'results-data/' directory must already exist,
# since joblib.dump is not expected to create missing directories -- confirm.
joblib.dump(
    value=lasso_search.best_estimator_,
    filename='results-data/best-model-gridserach.joblib',
)

['best-model-gridserach.joblib']