***
## GBM tuning

In [6]:
import numpy as np
from numpy.random import random_integers
import pandas as pd

# sklearn imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability import modelevaluation

In [7]:
(train_x, train_y), (test_x, test_y) = get_data("../19-04-30-EDA/results_data/")
print("{} points for training and {} for testing with {} features".format(
    train_x.shape[0], test_x.shape[0], test_x.shape[1]))

# pre-processing

preprocessing = general_preprocesing_pipeline(train_x)

preprocessing.fit(train_x)
train_x_transformed = preprocessing.transform(train_x)

train_x_transformed.shape

67817 points for training and 7534 for testing with 6 features


(67817, 80)

In [8]:
gbm_grid = {'loss':['huber'],
          'learning_rate':[0.1, 0.055, 0.01, 0.0055, 0.001],
          'n_estimators':[500, 700,1000, 1200, 1500, 1700, 2000],
          'min_samples_split':[4, 6, 8],
          'min_samples_leaf':[2, 3],
          'max_depth':[10, 15, 17, 20],
          'max_features':['log2']}

# make cross validation to shuffle the data
cross_val = KFold(n_splits=5, shuffle=True, random_state=42)

model = GradientBoostingRegressor()
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=gbm_grid,
                               cv=cross_val, n_iter=70, n_jobs=30,verbose=10,
                               scoring='r2')

random_cv.fit(train_x_transformed, train_y)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:  7.3min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed: 68.1min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed: 101.2min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed: 246.2min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed: 342.2min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed: 410.4min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed: 451.2min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed: 482.9min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed: 533.8min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed: 597.4min
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed: 684.5min
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed: 755.6min
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed: 805.5min
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed: 862.9min
[Parallel(n_jobs=30)]: Done 253 tasks      

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
          error_score='raise-deprecating',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=70, n_jobs=30,
          param_distributions={'loss': ['huber'], 'learning_rate': [0.1, 0.055, 0.01, 0.0055, 0.001], 'n_estimators': [500, 700, 1000, 1200, 1500, 1700, 2000], 'min_samples_split': [4, 6, 8], 'min_samples_leaf': [2, 3], 'max_depth': [10, 15, 17, 20], 'max_features': ['log2']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='r2', verbose=10)

In [9]:
random_cv.best_score_

0.3700031529927131

In [10]:
random_cv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 8,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'max_depth': 10,
 'loss': 'huber',
 'learning_rate': 0.01}

In [11]:
random_cv.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='huber', max_depth=10,
             max_features='log2', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=3, min_samples_split=8,
             min_weight_fraction_leaf=0.0, n_estimators=2000,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [12]:
models = {
    'gbm': random_cv.best_estimator_
}
modelevaluation.eval_models(models, preprocessing, test_x, test_y).to_csv("results_data/val_gbm.csv")

generating predictions for model: gbm


In [13]:
modelevaluation.crossvalidation(models, train_x_transformed, train_y).to_csv('results_data/cv_gbm.csv', index=False)

cv for model: gbm


In [14]:
pd.DataFrame(random_cv.cv_results_).to_csv("results_data/tuning_params_gbm.csv", index=False)

