In [88]:
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint

# sklearn import
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, r2_score
# my module imports
from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability import modelevaluation

In [2]:
# Load the train and test splits from the EDA results directory; each split
# is unpacked into its feature part (x) and target part (y).
train_split, test_split = get_data("../19-04-30-EDA/results_data/")
train_x, train_y = train_split
test_x, test_y = test_split

In [7]:
# Pre-processing pipeline.
# Built from the training features, fitted on the training features only, and
# then used to transform them. The fitted `preprocessing` object is reused
# later when the models are evaluated on (test_x, test_y).

preprocessing = general_preprocesing_pipeline(train_x)

# NOTE(review): fit and transform are kept as separate calls; presumably this
# is a scikit-learn style transformer, but the helper is defined elsewhere.
preprocessing.fit(train_x)
train_x_transformed = preprocessing.transform(train_x)

***
## Decision Tree Regressor

In [49]:
# Preview of the integer range 10..24 (stop value is exclusive); the same
# range is used for 'max_depth' in the decision-tree grid below.
np.arange(10, 25, 1)

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])

In [50]:
# Decision-tree hyper-parameter search.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split even with splitter='best', so an unseeded tree can pick
# different (tied) splits from run to run and make the search irreproducible.
tree_reg = DecisionTreeRegressor(random_state=42)

tree_grid = {
    # a float min_samples_split is read by scikit-learn as a fraction of the samples
    'min_samples_split': np.linspace(0.001, .03, 10),
    'max_features': [None],
    'splitter': ['best'],
    'max_depth': np.arange(10, 25)  # 10..24 inclusive
}

# 10 x 1 x 1 x 15 = 150 parameter combinations are handed to the project's
# gridsearch helper, which runs with `cores` parallel workers.
tree_search = modelevaluation.gridsearch(tree_reg, tree_grid, train_x_transformed, train_y, cores=15)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   2 tasks      | elapsed:    4.3s
[Parallel(n_jobs=15)]: Done  11 tasks      | elapsed:    4.9s
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    8.2s
[Parallel(n_jobs=15)]: Done  31 tasks      | elapsed:   12.4s
[Parallel(n_jobs=15)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=15)]: Done  55 tasks      | elapsed:   17.1s
[Parallel(n_jobs=15)]: Done  68 tasks      | elapsed:   22.0s
[Parallel(n_jobs=15)]: Done  83 tasks      | elapsed:   25.8s
[Parallel(n_jobs=15)]: Done  98 tasks      | elapsed:   30.5s
[Parallel(n_jobs=15)]: Done 115 tasks      | elapsed:   34.9s
[Parallel(n_jobs=15)]: Done 132 tasks      | elapsed:   40.1s
[Parallel(n_jobs=15)]: Done 151 tasks      | elapsed:   48.5s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:   52.9s
[Parallel(n_jobs=15)]: Done 191 tasks      | elapsed:   59.0s
[Parallel(n_jobs=15)]: Done 212 tasks      | elapsed:  

Best Score R2 =  0.10929953538618693
Best Parameters:  {'max_depth': 14, 'max_features': None, 'min_samples_split': 0.007444444444444444, 'splitter': 'best'}


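I will reuse the best `max_depth` found by the decision-tree search to train a random forest. Only `max_depth` is carried over; the other tree parameters are left at the random forest's defaults.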

***

## Random Forest

In [82]:
# Random-forest search over the number of trees.
# The tree depth is taken from the winning decision tree of the previous
# search instead of hard-coding the value printed by an earlier run, so this
# cell stays consistent if that search is re-run with a different outcome.
best_tree_depth = tree_search.best_estimator_.max_depth

# random_state pins the bootstrap sampling so repeated runs are comparable.
# n_jobs=8 parallelises each forest; the search itself uses 2 workers below.
rf_reg = RandomForestRegressor(max_depth=best_tree_depth, n_jobs=8, random_state=42)

rf_grid = {
    'n_estimators': np.arange(1700, 2000, 50)  # 1700, 1750, ..., 1950 (2000 excluded)
}

rf_search = modelevaluation.gridsearch(rf_reg, rf_grid, train_x_transformed, train_y, cores=2)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  7.9min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 17.7min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 42.1min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 73.1min
[Parallel(n_jobs=2)]: Done  21 out of  21 | elapsed: 118.0min remaining:    0.0s
[Parallel(n_jobs=2)]: Done  21 out of  21 | elapsed: 118.0min finished


Best Score R2 =  0.29296783933619036
Best Parameters:  {'n_estimators': 1750}


In [83]:
# Evaluate only the best random forest from the search on (test_x, test_y)
# through the fitted preprocessing object, then write the returned table to CSV.
rf_only = {'rf': rf_search.best_estimator_}
rf_test_results = modelevaluation.eval_models(rf_only, preprocessing, test_x, test_y)
rf_test_results.to_csv("res3.csv")

generating predictions for model: rf


***

## AdaBoost

In [102]:
# AdaBoost over shallow (max_depth=4) regression trees.
# random_state is set on the ensemble so reruns of the search are comparable;
# scikit-learn uses it to seed each boosting iteration's base estimator.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), random_state=42)
ada_grid = {
    'n_estimators': np.arange(10, 300, 100)  # candidates: 10, 110, 210
}
ada_search = modelevaluation.gridsearch(ada_reg, ada_grid, train_x_transformed, train_y, cores=17)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=17)]: Using backend LokyBackend with 17 concurrent workers.
[Parallel(n_jobs=17)]: Done   2 out of   9 | elapsed:   41.6s remaining:  2.4min
[Parallel(n_jobs=17)]: Done   3 out of   9 | elapsed:   42.7s remaining:  1.4min
[Parallel(n_jobs=17)]: Done   4 out of   9 | elapsed:   49.5s remaining:  1.0min
[Parallel(n_jobs=17)]: Done   5 out of   9 | elapsed:   53.7s remaining:   42.9s
[Parallel(n_jobs=17)]: Done   6 out of   9 | elapsed:   54.7s remaining:   27.4s
[Parallel(n_jobs=17)]: Done   7 out of   9 | elapsed:   58.3s remaining:   16.6s
[Parallel(n_jobs=17)]: Done   9 out of   9 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=17)]: Done   9 out of   9 | elapsed:  1.0min finished


Best Score R2 =  0.01858620666091429
Best Parameters:  {'n_estimators': 50}


In [120]:
# Best estimator from each search, keyed by the label used in the result tables.
models = {
    'decision tree': tree_search.best_estimator_,
    'AdaBoost': ada_search.best_estimator_,
    'random forest': rf_search.best_estimator_.set_params(n_jobs=2) # set params for cross validation
}
# All three are tree-based models, so the validation table is named to match
# the cross-validation file (cv_Tree-models.csv). The previous "non-Tree" name
# mislabelled these results.
modelevaluation.eval_models(models, preprocessing, test_x, test_y).to_csv("results_data/val_Tree-models.csv")

generating predictions for model: decision tree
generating predictions for model: AdaBoost
generating predictions for model: random forest


In [121]:
# Cross-validate every model in `models` on the transformed training data and
# save the returned table without its index column.
cv_results = modelevaluation.crossvalidation(models, train_x_transformed, train_y)
cv_results.to_csv('results_data/cv_Tree-models.csv', index=False)

cv for model: decision tree
cv for model: AdaBoost
cv for model: random forest
