# Learning Curves for models

In [8]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve, GroupKFold

from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline

In [9]:
# 1 DATA PRE-PROCESSING

# Load the train/test split; each side unpacks into (predictors, labels).
train_set, test_set = get_data('../data/191004-TrainAndTestSets/')
train_x, train_y = train_set
test_x, test_y = test_set

n_train = train_x.shape[0]
n_test = test_x.shape[0]
n_features = test_x.shape[1]
print("{} points for training and {} for testing with {} features".format(
    n_train, n_test, n_features))

# Index values of the training predictors; MYlearning_curve uses them as the
# group labels for its grouped k-fold splits.
groups = train_x.index.values

67775 points for training and 7576 for testing with 6 features


## General cross-validation strategy


In [10]:
# function
def MYlearning_curve(mdl_id_name, train_x, train_y, estimator, cv_groups=None, n_jobs=25):
    """Compute a grouped 5-fold learning curve for one estimator.

    The estimator is scored with R^2 at five training-set sizes, spaced
    evenly from 10% to 100% of each fold's training portion.

    Args:
        mdl_id_name (str): id to identify the model; printed as progress and
            stored in the ``model`` column of the result.
        train_x: training predictors, should be pre-processed for the
            particular model to be evaluated.
        train_y: training labels.
        estimator: model to evaluate.
        cv_groups: group label for every row of ``train_x``; rows sharing a
            label are kept in the same fold. When ``None`` the notebook-level
            ``groups`` variable is used, which keeps existing calls working.
        n_jobs (int): number of parallel workers handed to ``learning_curve``.

    Returns:
        pandas.DataFrame: one row per training-set size with the columns
        ``train_sizes``, ``train_scores_mean``, ``train_scores_std``,
        ``test_scores_mean``, ``test_scores_std`` and ``model``. Means and
        standard deviations are taken across the cross-validation folds.

    NOTE(review): ``train_x`` arrives already transformed, so any
    preprocessing fitted on the full training set has seen the validation
    folds too -- confirm this is acceptable for the reported scores.
    """
    if cv_groups is None:
        # Fall back to the module-level `groups` defined in the data-loading
        # cell, so callers that never passed groups behave as before.
        cv_groups = groups

    print('generating learning curve for ' + mdl_id_name)

    # Hand the splitter and the groups to learning_curve and let it build the
    # folds; this yields the same splits as pre-computing them by hand.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator,
        X=train_x,
        y=train_y,
        groups=cv_groups,
        cv=GroupKFold(n_splits=5),
        train_sizes=np.linspace(0.1, 1, 5),
        scoring='r2',
        n_jobs=n_jobs,
        verbose=10
    )

    # The score arrays are (n_train_sizes, n_folds): aggregate over the folds.
    learning_curve_results = pd.DataFrame({
        'train_sizes': train_sizes,
        'train_scores_mean': np.mean(train_scores, axis=1),
        'train_scores_std': np.std(train_scores, axis=1),
        'test_scores_mean': np.mean(test_scores, axis=1),
        'test_scores_std': np.std(test_scores, axis=1)
    })

    learning_curve_results['model'] = mdl_id_name
    return learning_curve_results



***
## Linear Models

In [11]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_decomposition import PLSRegression

# Locations of the saved linear estimators, keyed by model id.
linear_model_paths = {
    'lasso': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/lasso.joblib",
    'enet': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/enet.joblib",
    'linear': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/linear_reg.joblib",
    'pls': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/PLS.joblib",
}

# load the models
# (joblib.load unpickles the file, so only point it at trusted artefacts)
linear_models = {
    model_id: joblib.load(model_path)
    for model_id, model_path in linear_model_paths.items()
}

For the linear models I use a specific preprocessing pipeline that adds 2nd-degree polynomial features. This preprocessing pipeline is defined next.

In [12]:
# pre-processing

# Steps run in order: the project's general preprocessing, a degree-2
# polynomial expansion, a zero-variance filter, and a final standardisation.
linear_preprocessing_steps = [
    ('general', general_preprocesing_pipeline(train_x)),  # see the code for general_preprocesing_pipeline
    ('polyfeaturs', PolynomialFeatures(degree=2)),
    ('zerovar', VarianceThreshold(threshold=0.0)),
    # scale again: the polynomial features are not necessarily well scaled
    ('scaling', StandardScaler()),
]
preprocessing = Pipeline(steps=linear_preprocessing_steps)

# Fit on the training predictors, then transform those same predictors.
preprocessing.fit(train_x)
train_x_transformed_for_linear = preprocessing.transform(train_x)

In [13]:
# Run the learning curve for every loaded linear model, in dict order, and
# stack the per-model result frames into a single table.
res_linear = pd.concat([
    MYlearning_curve(model_name, train_x_transformed_for_linear, train_y, model)
    for model_name, model in linear_models.items()
])

generating learning curve for lasso
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:   41.1s remaining:  7.9min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:   43.8s remaining:  2.9min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:   50.8s remaining:  1.8min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:   56.2s remaining:  1.2min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:  1.1min remaining:   50.3s
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:  1.1min remaining:   30.4s
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:  1.1min remaining:   16.6s
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:  1.2min remaining:    6.3s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:  1.5min finished


generating learning curve for enet
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:   40.7s remaining:  7.8min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:   46.5s remaining:  3.1min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:  1.3min remaining:  2.8min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:  1.8min remaining:  2.3min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:  2.0min remaining:  1.5min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:  2.5min remaining:  1.2min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:  2.8min remaining:   41.5s
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:  3.1min remaining:   16.2s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:  3.4min finished


generating learning curve for linear
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:  1.1min remaining: 13.0min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:  1.3min remaining:  5.3min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:  1.6min remaining:  3.5min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:  2.0min remaining:  2.5min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:  2.3min remaining:  1.8min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:  2.5min remaining:  1.2min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:  2.6min remaining:   39.3s
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:  3.0min remaining:   15.6s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:  3.5min finished


generating learning curve for pls
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:   26.7s remaining:  5.1min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:   28.6s remaining:  1.9min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:   35.5s remaining:  1.3min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:   40.7s remaining:   51.8s
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:   51.2s remaining:   40.2s
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:   52.0s remaining:   24.5s
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:   54.4s remaining:   13.6s
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:   56.5s remaining:    4.9s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:  1.3min finished


In [14]:
# Tag every row with the model family before writing the table out.
res_linear['type'] = "linear"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results-data', exist_ok=True)
res_linear.to_csv('results-data/lc-linear.csv', index=False)

***

## Non-linear and Tree Models


In [4]:
# Locations of the saved non-linear / tree-based estimators, keyed by model id.
non_linear_model_paths = {
    'knn': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/knn.joblib",
    'adaBoost': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/AdaBoost.joblib",
    'decisionTree': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/decision tree.joblib",
    'gbm': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/gbm.joblib",
    'randomforest': "../191004-TrainPredictiveModelsMrnaStability/results_data/trained_models/random forest.joblib",
}

# load the models
# (joblib.load unpickles the file, so only point it at trusted artefacts)
non_linear_models = {
    model_id: joblib.load(model_path)
    for model_id, model_path in non_linear_model_paths.items()
}

# reset the params for random forest: force a single job per fit.
# MYlearning_curve already runs its fits in parallel, so this presumably
# avoids nested parallelism -- confirm the intent.
random_forest = non_linear_models['randomforest']
non_linear_models['randomforest'] = random_forest.set_params(n_jobs=1)

For the next models (non-linear and tree-based) we use the general pre-processing pipeline:

In [5]:
# Build the project's general preprocessing pipeline from the training
# predictors and apply it to those same predictors.
# NOTE(review): there is no explicit .fit() call here -- presumably
# general_preprocesing_pipeline returns an already-fitted pipeline; confirm
# against its source.
general_pipeline = general_preprocesing_pipeline(train_x)
train_x_transformed = general_pipeline.transform(train_x)

In [6]:
# Run the learning curve for every loaded non-linear model, in dict order,
# and stack the per-model result frames into a single table.
res_nonlinear = pd.concat([
    MYlearning_curve(model_name, train_x_transformed, train_y, model)
    for model_name, model in non_linear_models.items()
])

generating learning curve for knn
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:   49.4s remaining:  9.5min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:   55.3s remaining:  3.7min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:  4.3min remaining:  9.0min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:  8.3min remaining: 10.6min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:  9.7min remaining:  7.6min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed: 16.0min remaining:  7.5min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed: 18.3min remaining:  4.6min
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed: 24.0min remaining:  2.1min
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed: 26.8min finished


generating learning curve for adaBoost
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:   46.0s remaining:  8.8min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:   46.7s remaining:  3.1min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:  2.3min remaining:  4.9min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:  3.7min remaining:  4.7min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:  4.0min remaining:  3.1min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:  5.3min remaining:  2.5min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:  5.6min remaining:  1.4min
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:  6.7min remaining:   35.1s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:  6.9min finished


generating learning curve for decisionTree
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:    1.4s remaining:   15.7s
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:    2.2s remaining:    8.7s
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:    2.5s remaining:    5.4s
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed:    2.9s remaining:    3.7s
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed:    3.6s remaining:    2.8s
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed:    4.7s remaining:    2.2s
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed:    5.2s remaining:    1.3s
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed:    7.3s remaining:    0.6s
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed:    8.6s finished


generating learning curve for gbm
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:  3.8min remaining: 43.5min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:  4.0min remaining: 15.8min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed:  8.3min remaining: 17.6min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed: 12.3min remaining: 15.6min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed: 12.8min remaining: 10.0min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed: 19.2min remaining:  9.0min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed: 20.8min remaining:  5.2min
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed: 25.6min remaining:  2.2min
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed: 27.2min finished


generating learning curve for randomforest
[learning_curve] Training set sizes: [ 5422 17621 29821 42020 54220]


[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done   2 out of  25 | elapsed:  5.7min remaining: 65.3min
[Parallel(n_jobs=25)]: Done   5 out of  25 | elapsed:  5.9min remaining: 23.5min
[Parallel(n_jobs=25)]: Done   8 out of  25 | elapsed: 19.7min remaining: 41.8min
[Parallel(n_jobs=25)]: Done  11 out of  25 | elapsed: 32.1min remaining: 40.9min
[Parallel(n_jobs=25)]: Done  14 out of  25 | elapsed: 34.8min remaining: 27.3min
[Parallel(n_jobs=25)]: Done  17 out of  25 | elapsed: 46.6min remaining: 21.9min
[Parallel(n_jobs=25)]: Done  20 out of  25 | elapsed: 49.0min remaining: 12.3min
[Parallel(n_jobs=25)]: Done  23 out of  25 | elapsed: 59.4min remaining:  5.2min
[Parallel(n_jobs=25)]: Done  25 out of  25 | elapsed: 60.1min finished


In [7]:
# Tag every row with the model family before writing the table out.
res_nonlinear['type'] = "non-linear"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results-data', exist_ok=True)
res_nonlinear.to_csv('results-data/lc-nonlinear.csv', index=False)