In [15]:
import numpy as np
import pandas as pd

# sklearn import
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_decomposition import PLSRegression

# cross-validation technique
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, GroupKFold, cross_val_score


# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability import modelevaluation

In [11]:
# Load the pre-split train/test sets produced by the earlier EDA step.
DATA_DIR = "../19-04-30-PredictiveModelDecayAllSpecies/19-04-30-EDA/results_data/"
(train_x, train_y), (test_x, test_y) = get_data(DATA_DIR)

# Grouped cross-validation: the index values of the training frame are used
# as the group labels (presumably one label per gene/transcript -- TODO confirm).
groups_cv = train_x.index.values

summary_template = "{} points for training and {} for testing with {} features"
print(summary_template.format(train_x.shape[0], test_x.shape[0], test_x.shape[1]))

67817 points for training and 7534 for testing with 6 features


In [12]:
# Feature-engineering pipeline: project-specific preprocessing, degree-2
# polynomial expansion, removal of constant columns, then standardisation.
preprocessing_steps = [
    # see the code of general_preprocesing_pipeline for what this step does
    ('general', general_preprocesing_pipeline(train_x)),
    ('polyfeaturs', PolynomialFeatures(degree=2)),
    # drop columns that are constant after the expansion
    ('zerovar', VarianceThreshold(threshold=0.0)),
    # scale again: the polynomial terms are not guaranteed to be well scaled
    ('scaling', StandardScaler()),
]
preprocessing = Pipeline(preprocessing_steps)

# NOTE(review): the pipeline is fitted on the whole training set before any
# cross-validation split, so the statistics it learns (e.g. the scaler's) also
# include the rows that later act as validation folds.
train_x_transformed = preprocessing.fit(train_x).transform(train_x)

In [23]:
# Baseline model: ordinary least squares wrapped in a single-step pipeline.
lm_steps = [('lm', LinearRegression())]
lm_reg = Pipeline(lm_steps)

# No hyper-parameters to tune for the baseline, hence an empty grid.
lm_grid = {}

In [24]:
# Grouped 10-fold cross-validation scored with R^2; the group labels
# themselves are passed later, when the search is fitted.
r2score = make_scorer(r2_score)
cross_val = GroupKFold(n_splits=10)

grid_search = GridSearchCV(
    estimator=lm_reg,
    param_grid=lm_grid,
    scoring=r2score,
    cv=cross_val,
    n_jobs=4,
    verbose=10,
)

In [27]:
# Run the grouped cross-validation on the transformed training features;
# `groups` is forwarded to the GroupKFold splitter configured on the search.
grid_search.fit(
    train_x_transformed,
    train_y,
    groups=groups_cv,
)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of  10 | elapsed: 10.9min remaining: 10.9min
[Parallel(n_jobs=4)]: Done   7 out of  10 | elapsed: 10.9min remaining:  4.7min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 13.6min finished


GridSearchCV(cv=GroupKFold(n_splits=10), error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lm', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]),
       fit_params=None, iid='warn', n_jobs=4, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(r2_score), verbose=10)

In [34]:
# Mean cross-validated score (the R^2 scorer passed as `scoring`) of the best
# candidate; the parameter grid is empty, so this is the plain linear model.
grid_search.best_score_

0.1286173537241034

In [47]:
# Re-evaluate with 5 grouped folds, this time passing explicit splits as `cv`.
#
# GroupKFold.split() returns a one-shot generator. It is materialised into a
# list of (train_indices, test_indices) pairs so that the evaluation cell can
# be re-run without first re-running this one (an exhausted generator would
# otherwise yield no folds). A list of index pairs is a valid `cv` argument.
group_kfold = GroupKFold(n_splits=5)
cross_val = list(
    group_kfold.split(train_x_transformed, train_y, groups=groups_cv)
)

r2score = make_scorer(r2_score)

In [48]:
# Per-fold R^2 of the refitted best estimator on the pre-computed splits.
res = cross_val_score(
    grid_search.best_estimator_,
    train_x_transformed,
    train_y,
    scoring=r2score,
    cv=cross_val,
    n_jobs=4,
)