# Train GradientBoostinRegressor to generate the predictions intervals


Here, we train the Gradient Boosting that optimizes the `loss="quantile", alpha=0.5` that is the predicted median, the idea is to find the best set of hyperparameters that optimize this metric. Then I will train with the found hyperparameters a model to produce the upper and lower prediction intervals.

Here, I determine the optimal hyperparameter values.

In [1]:
import numpy as np
from numpy.random import random_integers
import pandas as pd

# sklearn imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability import modelevaluation

In [6]:
# LOAD TRAINING DATA

(train_x, train_y), (test_x, test_y) = get_data("../19-04-30-PredictiveModelDecayAllSpecies/19-04-30-EDA/results_data/")

print("{} points for training and {} for testing with {} features".format(
    train_x.shape[0], test_x.shape[0], test_x.shape[1]))

# pre-processing

preprocessing = general_preprocesing_pipeline(train_x)

preprocessing.fit(train_x)
train_x_transformed = preprocessing.transform(train_x)

train_x_transformed.shape

67817 points for training and 7534 for testing with 6 features


(67817, 80)

In [8]:
gbm_grid = {
          'learning_rate':[0.1, 0.055, 0.01, 0.0055, 0.001],
          'n_estimators':[500, 700,1000, 1200, 1500, 1700, 2000],
          'min_samples_split':[4, 6, 8],
          'min_samples_leaf':[2, 3],
          'max_depth':[10, 15, 17, 20],
          'max_features':['log2']}

# make cross validation to shuffle the data
cross_val = KFold(n_splits=5, shuffle=True, random_state=42)

model = GradientBoostingRegressor(loss='quantile', alpha=0.5)
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=gbm_grid,
                               cv=cross_val, n_iter=70, n_jobs=30,verbose=10,
                               scoring='r2')

random_cv.fit(train_x_transformed, train_y)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed: 17.9min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed: 68.7min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed: 116.7min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed: 156.7min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed: 196.4min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed: 222.0min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed: 286.4min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed: 348.2min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed: 405.1min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed: 471.2min
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed: 528.0min
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed: 596.5min
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed: 666.6min
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed: 776.5min
[Parallel(n_jobs=30)]: Done 253 tasks      

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=GradientBoostingRegressor(alpha=0.5,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='quantile',
                                                       max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_sam

In [9]:
random_cv.best_score_

0.36542181074940194

In [10]:
random_cv.best_params_

{'n_estimators': 1700,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'max_depth': 17,
 'learning_rate': 0.01}

In [11]:
random_cv.best_estimator_

GradientBoostingRegressor(alpha=0.5, criterion='friedman_mse', init=None,
                          learning_rate=0.01, loss='quantile', max_depth=17,
                          max_features='log2', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=1700,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)