# 00: List of models

## Setup

In [7]:
import pickle

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lars, Lasso, LinearRegression, MultiTaskLasso 
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_validate, LeaveOneGroupOut
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

import bin.params as p
import bin.utils as utils
import bin.baseline_models as bm
from bin.msvr.model.MSVR import MSVR

In [8]:
# developer settings
# Seed handed to the stochastic estimators defined further down.
RANDOM_STATE = 2
# Forwarded as `n_jobs` to the estimators/wrappers below
# (in scikit-learn, -1 means "use all available processors").
N_JOBS = -1
# NOTE(review): the two settings below are not referenced in the visible cells;
# presumably meant for `cross_validate(verbose=..., error_score=...)` - confirm.
ERROR_SCORE = 'raise'
VERBOSE = 0

In [9]:
# Destination of the pickled model registry, rooted at the project data directory.
MODELS_FILE_PATH = '{}/pickles/models.p'.format(p.DATA_DIR)

---

## Model definitions

**Note: Model keys must not contain the `_` (underscore) character - that would confuse the Snakemake rule parser.**

In [11]:
# Base estimators shared by the multi-output wrappers in `models` below.
# The AdaBoost variants are keyed by their `n_estimators` value.
adaboosts = {
    n_estimators: AdaBoostRegressor(n_estimators=n_estimators, random_state=RANDOM_STATE)
    for n_estimators in (1, 2, 4, 5)
}
gb = GradientBoostingRegressor(random_state=RANDOM_STATE, n_estimators=2)

# ------
# Registry of candidate models, keyed by a short name; the whole dict is
# pickled at the end of this cell.
#
# BEWARE - do not use the _ character in the `models` dictionary keys!!!
# Doing so would confuse the Snakefile script and would result in errors.
#
# Trailing markers ("ok", "problem", "???", "hmm") are the author's status
# notes for each model and are kept as-is.
#
# Every estimator that accepts a seed and is stochastic by default
# (AdaBoost/GradientBoosting above, random forests and the decision tree here)
# receives RANDOM_STATE so that repeated runs produce the same fitted models.
# ------
models = {
    # baseline
    'BLavgpos': bm.AverageForResidueAtPosition(),
    'BLmeansamerespos': bm.StatisticForSameResidueAtPosition(statistic='mean'),
    'BLmediansamerespos': bm.StatisticForSameResidueAtPosition(statistic='median'),
    'BLknnwholeseqn3': bm.KNNWholeSequence(n_neighbors=3),
    'BLknnwholeseqn10': bm.KNNWholeSequence(n_neighbors=10),

    # sklearn linear models
    'linreg': LinearRegression(fit_intercept=True),  # problem
    'lars': Lars(),  # problem
    'elasticnet': ElasticNet(),  # ???
    'lasso': Lasso(),  # problem
    'multilasso': MultiTaskLasso(),  # problem

    # misc
    'kernelridge': KernelRidge(),  # OK
    'gaussianprocess': GaussianProcessRegressor(),  # ??? runs for a long time

    # svr
    'svr': MSVR(),  # https://github.com/Analytics-for-Forecasting/msvr # OK
    'morsvr': MultiOutputRegressor(SVR(), n_jobs=N_JOBS),  # ???
    # NOTE(review): isotonic regression is not an SVR; it is left in this
    # position only so that the key order of the dict stays unchanged.
    'isotonicreg': IsotonicRegression(),  # problem

    # trees
    'gradientboostingmor': MultiOutputRegressor(gb, n_jobs=N_JOBS),
    'gradientboostingrc': RegressorChain(gb),
    # NOTE(review): 'randomforest', 'randomforest2' and 'randomforest3' have the
    # same configuration as 'randomForestN5', 'randomForestN15' and
    # 'randomForestN30'; presumably older names kept for existing targets - confirm.
    'randomforest': RandomForestRegressor(n_estimators=5, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomforest2': RandomForestRegressor(n_estimators=15, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomforest3': RandomForestRegressor(n_estimators=30, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomForestN2': RandomForestRegressor(n_estimators=2, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomForestN3': RandomForestRegressor(n_estimators=3, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomForestN5': RandomForestRegressor(n_estimators=5, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomForestN15': RandomForestRegressor(n_estimators=15, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'randomForestN30': RandomForestRegressor(n_estimators=30, n_jobs=N_JOBS, random_state=RANDOM_STATE),  # ok
    'decisiontree': DecisionTreeRegressor(random_state=RANDOM_STATE),  # ok
    'adaboost': RegressorChain(adaboosts[1]),  # hmmm
    'adaboostmor': MultiOutputRegressor(adaboosts[1], n_jobs=N_JOBS),  # hmm2
    # 'adaboostmor5' has the same configuration as 'adaboostMorN5' below.
    'adaboostmor5': MultiOutputRegressor(adaboosts[5], n_jobs=N_JOBS),  # hmm2
    'adaboostMorN2': MultiOutputRegressor(adaboosts[2], n_jobs=N_JOBS),
    'adaboostMorN4': MultiOutputRegressor(adaboosts[4], n_jobs=N_JOBS),
    'adaboostMorN5': MultiOutputRegressor(adaboosts[5], n_jobs=N_JOBS),
}

# Check for invalid _ characters and, on failure, name the offending keys so
# the mistake can be fixed without scanning the whole dictionary by hand.
invalid_model_keys = [model_key for model_key in models if '_' in model_key]
assert not invalid_model_keys, (
    'model keys cannot contain _ character, that would confuse Snakemake rule parser; '
    f'offending keys: {invalid_model_keys}'
)

# Write the complete model registry to MODELS_FILE_PATH as a single pickle.
with open(MODELS_FILE_PATH, mode='wb') as pickle_out:
    pickle.dump(models, pickle_out)