## Learning Curve

In [27]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve, KFold

from optimalcodon.projects.rnastability import dataprocessing
from optimalcodon.projects.rnastability import dataprocessing


## 1. Define the estimator object

In [None]:
# Load the pre-split data and build the preprocessing step.
# NOTE(review): the preprocessing pipeline is constructed from train_x only; what it
# derives from that frame is defined in `dataprocessing` -- confirm there.
(train_x, train_y), (test_x, test_Y) = dataprocessing.get_data("../19-04-30-EDA/results_data")
preprocessing = dataprocessing.general_preprocesing_pipeline(train_x)


# Gradient boosting regressor with every hyper-parameter spelled out.
# `min_impurity_split` and `presort` are deliberately not passed: they were only ever
# set to their default values here, and recent scikit-learn releases reject them.
# `random_state` is pinned because `max_features='log2'` samples features at each split;
# 42 matches the seed used for the CV splitter and the learning curve in this notebook.
gbm = GradientBoostingRegressor(
        alpha=0.9,
        criterion='friedman_mse',
        init=None,
        learning_rate=0.01,
        loss='huber',
        max_depth=10,
        max_features='log2',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=3,
        min_samples_split=8,
        min_weight_fraction_leaf=0.0,
        n_estimators=2000,
        n_iter_no_change=None,
        random_state=42,
        subsample=1.0,
        tol=0.0001,
        validation_fraction=0.1,
        verbose=0,
        warm_start=False)

# Chain preprocessing and the regressor so both are re-fit together on every fit call.
estimator = Pipeline([('pre-processing', preprocessing),
                      ('gbm', gbm)])

## 2. Cross Validation Strategy

In [32]:
## CROSS VALIDATION STRATEGY
## we use the whole data set for this analysis
# Stack the train and test portions back together, targets and features in the same
# order. pd.concat is used for both because Series.append was removed in pandas 2.0.
Y = pd.concat([train_y, test_Y])
X = pd.concat([train_x, test_x])

# 10-fold split, shuffled with a fixed seed so the folds are the same on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

## 3. Learning Curve

In [33]:
# Fit the pipeline on growing portions of each training fold and score it with R^2.
# Ten size fractions (10% .. 100%) times the folds produced by `cv` are fitted in
# parallel; the recorded run below took roughly 84 minutes with 30 workers.
train_sizes, train_scores, test_scores = learning_curve(
    estimator,
    X,
    Y,
    # what to evaluate
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    scoring='r2',
    # shuffle the training data with a fixed seed before the subsets are taken
    shuffle=True,
    random_state=42,
    # execution settings
    n_jobs=30,
    verbose=10,
)

[learning_curve] Training set sizes: [ 6781 13563 20344 27126 33907 40689 47470 54252 61033 67815]


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   2 out of 100 | elapsed:  3.6min remaining: 175.9min
[Parallel(n_jobs=30)]: Done  13 out of 100 | elapsed: 12.0min remaining: 80.1min
[Parallel(n_jobs=30)]: Done  24 out of 100 | elapsed: 18.9min remaining: 59.7min
[Parallel(n_jobs=30)]: Done  35 out of 100 | elapsed: 27.2min remaining: 50.5min
[Parallel(n_jobs=30)]: Done  46 out of 100 | elapsed: 35.9min remaining: 42.1min
[Parallel(n_jobs=30)]: Done  57 out of 100 | elapsed: 43.7min remaining: 33.0min
[Parallel(n_jobs=30)]: Done  68 out of 100 | elapsed: 51.0min remaining: 24.0min
[Parallel(n_jobs=30)]: Done  79 out of 100 | elapsed: 58.5min remaining: 15.5min
[Parallel(n_jobs=30)]: Done  90 out of 100 | elapsed: 68.8min remaining:  7.6min
[Parallel(n_jobs=30)]: Done 100 out of 100 | elapsed: 84.3min finished


In [35]:
# Reduce over axis 1 to get one mean and one standard deviation per training-set size.
# Per the scikit-learn learning_curve docs, rows are training sizes and columns are CV folds.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [44]:
# Collect the learning-curve summary: one row per training-set size.
learning_curve_summary = pd.DataFrame({
    'train_sizes': train_sizes,
    'train_scores_mean': train_scores_mean,
    'train_scores_std': train_scores_std,
    'test_scores_mean': test_scores_mean,
    'test_scores_std': test_scores_std
})

# Create the output folder if it is missing, so the write cannot fail after the
# long-running learning-curve computation has already finished.
learning_curve_path = Path("results_data/learning_curve.csv")
learning_curve_path.parent.mkdir(parents=True, exist_ok=True)
learning_curve_summary.to_csv(learning_curve_path, index=False)