# Train GBR in mRNA stability data

In [1]:
import numpy as np
from numpy.random import random_integers
import pandas as pd

# sklearn imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline


# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline2
from optimalcodon.projects.rnastability import modelevaluation

In [2]:
(train_x, train_y), (test_x, test_y) = get_data("results_data/")
print("{} points for training and {} for testing with {} features".format(
    train_x.shape[0], test_x.shape[0], test_x.shape[1]))

# pre-processing

preprocessing = general_preprocesing_pipeline2(train_x)

preprocessing.fit(train_x)
train_x_transformed = preprocessing.transform(train_x)

train_x_transformed.shape

67832 points for training and 7536 for testing with 8 features


(67832, 82)

In [5]:
gbm_grid = {'loss':['huber'],
          'learning_rate':[0.1, 0.055, 0.01, 0.0055, 0.001],
          'n_estimators':[500, 700,1000, 1200, 1500, 1700, 2000],
          'min_samples_split':[4, 6, 8],
          'min_samples_leaf':[2, 3],
          'max_depth':[10, 15, 17, 20],
          'max_features':['log2']}

# make cross validation to shuffle the data
cross_val = KFold(n_splits=4, shuffle=True, random_state=42)

model = GradientBoostingRegressor()
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=gbm_grid,
                               cv=cross_val, n_iter=50, n_jobs=30,verbose=10,
                               scoring='r2')

random_cv.fit(train_x_transformed, train_y)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:  9.9min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed: 40.2min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed: 90.3min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed: 158.4min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed: 222.8min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed: 283.3min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed: 348.9min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed: 368.4min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed: 418.2min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed: 473.8min
[Parallel(n_jobs=30)]: Done 162 out of 200 | elapsed: 537.9min remaining: 126.2min
[Parallel(n_jobs=30)]: Done 183 out of 200 | elapsed: 587.5min remaining: 54.6min
[Parallel(n_jobs=30)]: Done 200 out of 200 | elapsed: 742.0min finished


RandomizedSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=GradientBoostingRegressor(alpha=0.9,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                               

In [6]:
random_cv.best_score_

0.369271501664413

In [7]:
random_cv.best_params_


{'n_estimators': 1700,
 'min_samples_split': 8,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'max_depth': 10,
 'loss': 'huber',
 'learning_rate': 0.01}

In [8]:
models = {
    'gbm': random_cv.best_estimator_
}
modelevaluation.eval_models(models, preprocessing, test_x, test_y).to_csv("results_data/val_gbm.csv")

generating predictions for model: gbm


In [9]:
modelevaluation.crossvalidation(models, train_x_transformed, train_y).to_csv('results_data/cv_gbm.csv', index=False)

cv for model: gbm


In [10]:
pwd

'/n/projects/smedina/projectos/190108-mzt-rna-stability/results/19-07-18-PredictiveModelWithM6AandMicroRNAs'