In [1]:
#imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from hyperopt import hp
from hyperopt.pyll import scope
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, FeatureUnion

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel
from models.utils import CustomDataFrameMapper
from models.utils import _tree_features_transformations, _svr_features_transformations

  from pandas.core import datetools


In [2]:
# Load the preprocessed data set from its pickle file.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# pickles that come from a trusted source.
preprocessed_data = pd.read_pickle('./data/preprocesed_data.200.20.big.pd')

# Wrap the loaded object in a DataFrame; `df` is the frame that gets split
# into training and test rows further down.
# (The original cell also evaluated `preprocessed_data.columns` here, but a
# bare expression in the middle of a cell is neither displayed nor used.)
df = pd.DataFrame(preprocessed_data)


In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical from run to run.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each partition of the split
n_train_rows, n_test_rows = len(train), len(test)
print('Number of observations in the training data:', n_train_rows)
print('Number of observations in the test data:', n_test_rows)

Number of observations in the training data: 114411
Number of observations in the test data: 28603


In [None]:
# Fetch the four groups of feature transformations for the tree-based models.
# The helper returns them in this order: answer, question, time, user.
(
    answer_features_transformations,
    question_features_transformations,
    time_features_transformations,
    user_features_transformations,
) = _tree_features_transformations()

In [None]:
    # Cell-level switches:
    #   DO_LOWESS - forwarded to cat.run() when the hyperopt search is used
    #   hyperopt  - True: run the hyperopt search over cat.space
    #               False: fit once with the fixed parameters below
    DO_LOWESS = False
    hyperopt = False

    # Fixed CatBoostRegressor settings used when the search is skipped
    cat_fixed_params = {
        'bagging_temperature': 0.29793733267072053,
        'iterations': 5000,
        'l2_leaf_reg': 1.5511980979084095,
        'learning_rate': 0.08683045812519587,
        'loss_function': 'RMSE',
        'random_seed': 0,
        'random_strength': 20,
        'verbose': True,
        'train_dir': 'outputs/cat',
    }

    # CatBoostRegressor wrapped in the project's HyperoptModel
    # (cv=3 is presumably the number of cross-validation folds - confirm in HyperoptModel)
    cat = HyperoptModel(train.copy(), test.copy(), 'cat', cv=3)
    cat.raw_features = []
    cat.pipeline = Pipeline(steps=[
        # Concatenate the outputs of the four feature groups side by side
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', CustomDataFrameMapper(user_features_transformations)),
            ('time_features', CustomDataFrameMapper(time_features_transformations)),
            ('answer_features', CustomDataFrameMapper(answer_features_transformations)),
            ('question_features', CustomDataFrameMapper(question_features_transformations)),
        ])),
        ('estimate', CatBoostRegressor(**cat_fixed_params)),
    ])

    # Collect the raw column name behind every mapped feature. The first item
    # of a feature spec is either a column name or a sequence whose first
    # element is the column name.
    for step_name, mapper in cat.pipeline.named_steps['prepare_features'].transformer_list:
        cat.raw_features += [
            (spec[0] if isinstance(spec[0], str) else spec[0][0])
            for spec in mapper.features
        ]

    # Hyperopt search space, keyed by pipeline parameter name.
    cat.space = {
        # single-option choices: these stay constant during the search
        'estimate__iterations': hp.choice('estimate__iterations', [50]),
        'estimate__loss_function': hp.choice('estimate__loss_function', ['RMSE']),
        'estimate__train_dir': hp.choice('estimate__train_dir', ['outputs/cat']),
        'estimate__thread_count': hp.choice('estimate__thread_count', [4]),
        'estimate__used_ram_limit': hp.choice('estimate__used_ram_limit', [4 * 1024 ** 3]),  # 4gb
        'estimate__random_seed': hp.choice('estimate__random_seed', [0]),

        # parameters that are actually searched
        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -5, 0),
        'estimate__random_strength': hp.choice('estimate__random_strength', [1, 20]),
        'estimate__l2_leaf_reg': hp.loguniform('estimate__l2_leaf_reg', 0, np.log(10)),
        'estimate__bagging_temperature': hp.uniform('estimate__bagging_temperature', 0, 1),
    }

    # Finding the best number of trees (following
    # https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning_trees-number-docpage/#parameter-tuning_trees-number).
    # To use the disabled snippet below, also substitute the CatBoostRegressor step with:
    #     ('estimate', CatBoostRegressor(iterations=10000, loss_function='RMSE', auto_stop_pval=1e-4, use_best_model=True, train_dir='outputs/cat_trees', verbose=True))
    #
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # eval_X = cat.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # cat.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set)
    # cat.model = cat.pipeline
    # print(cat.model.named_steps['estimate'].get_params())
    # cat.model.named_steps['estimate'].save_model('tmp/cat.model'.encode('utf-8'))

    if not hyperopt:
        # Single fit with the fixed parameters configured above
        cat.pipeline.fit(X=cat.X_train, y=cat.y_train)
        cat.model = cat.pipeline
        cat.stats()

        # joblib.dump(cat, './outputs/models/%s.pckl' % cat.output_prefix)
        # cat = joblib.load('./outputs/models/%s.pckl' % cat.output_prefix)

        # Diagnostics for the fitted model
        cat.plot_predicted_vs_actual(do_lowess=False)
        cat.plot_residuals(r_type='raw', do_lowess=False)
        cat.plot_feature_importance()
    else:
        # Hyperopt search over cat.space
        cat.run(do_lowess=DO_LOWESS)

Borders generated
0:	learn 0.5034394424passed: 0.229 sec	total: 1.11s	remaining: 1h 32m 20s
1:	learn 0.4617420782passed: 0.177 sec	total: 1.29s	remaining: 53m 33s
2:	learn 0.4238395137passed: 0.176 sec	total: 1.46s	remaining: 40m 35s
3:	learn 0.389409431passed: 0.189 sec	total: 1.65s	remaining: 34m 22s
4:	learn 0.3581937181passed: 0.176 sec	total: 1.83s	remaining: 30m 26s
5:	learn 0.3298876144passed: 0.176 sec	total: 2s	remaining: 27m 47s
6:	learn 0.3042402245passed: 0.179 sec	total: 2.18s	remaining: 25m 57s
7:	learn 0.2809361687passed: 0.178 sec	total: 2.36s	remaining: 24m 33s
8:	learn 0.2600170142passed: 0.183 sec	total: 2.54s	remaining: 23m 31s
9:	learn 0.2412321805passed: 0.18 sec	total: 2.72s	remaining: 22m 39s
10:	learn 0.224281573passed: 0.175 sec	total: 2.9s	remaining: 21m 55s
11:	learn 0.2090629892passed: 0.183 sec	total: 3.08s	remaining: 21m 21s
12:	learn 0.1950296232passed: 0.181 sec	total: 3.26s	remaining: 20m 52s
13:	learn 0.1823680924passed: 0.18 sec	total: 3.44s	remainin

In [None]:
    # Cell-level switches:
    #   DO_LOWESS - forwarded to lgbm.run() when the hyperopt search is used
    #   hyperopt  - True: run the hyperopt search over lgbm.space
    #               False: fit once with the fixed parameters below
    DO_LOWESS = False
    hyperopt = False

    # Fixed LGBMRegressor settings used when the search is skipped.
    # NOTE(review): the LightGBM docs state that bagging_fraction only takes
    # effect when bagging_freq > 0; bagging_freq is set neither here nor in
    # the search space - confirm whether bagging is meant to be active.
    lgbm_fixed_params = {
        'bagging_fraction': 0.9583593582453502,
        'feature_fraction': 0.797191970090108,
        'lambda_l1': 0,
        'lambda_l2': 0,
        'learning_rate': 0.06967397660277702,
        'min_data_in_leaf': 2,
        'min_sum_hessian_in_leaf': 3.8117576166032006,
        'n_estimators': 435,
        'num_leaves': 287,
        'objective': 'regression',
        'seed': 0,
    }

    # LGBMRegressor wrapped in the project's HyperoptModel
    # (cv=5 is presumably the number of cross-validation folds - confirm in HyperoptModel)
    lgbm = HyperoptModel(train.copy(), test.copy(), 'lgbm', cv=5)
    lgbm.raw_features = []
    lgbm.pipeline = Pipeline(steps=[
        # Concatenate the outputs of the four feature groups side by side
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', CustomDataFrameMapper(user_features_transformations)),
            ('time_features', CustomDataFrameMapper(time_features_transformations)),
            ('answer_features', CustomDataFrameMapper(answer_features_transformations)),
            ('question_features', CustomDataFrameMapper(question_features_transformations)),
        ])),
        ('estimate', lgb.LGBMRegressor(**lgbm_fixed_params)),
    ])

    # Collect the raw column name behind every mapped feature. The first item
    # of a feature spec is either a column name or a sequence whose first
    # element is the column name.
    for step_name, mapper in lgbm.pipeline.named_steps['prepare_features'].transformer_list:
        lgbm.raw_features += [
            (spec[0] if isinstance(spec[0], str) else spec[0][0])
            for spec in mapper.features
        ]

    # Finding the number of trees (disabled snippet kept for reference):
    #
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # eval_X = lgbm.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # best = lgbm.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set, estimate__early_stopping_rounds=10)
    # print(best.named_steps['estimate'].best_iteration)

    # Hyperopt search space, keyed by pipeline parameter name.
    lgbm.space = {
        # single-option choices: these stay constant during the search
        'estimate__objective': hp.choice('estimate__objective', ['regression']),
        'estimate__n_estimators': hp.choice('estimate__n_estimators', [400]),
        'estimate__seed': hp.choice('estimate__seed', [0]),

        # parameters that are actually searched
        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -7, 0),
        'estimate__num_leaves': scope.int(hp.qloguniform('estimate__num_leaves', 1, 7, 1)),
        'estimate__feature_fraction': hp.uniform('estimate__feature_fraction', 0.5, 1),
        'estimate__bagging_fraction': hp.uniform('estimate__bagging_fraction', 0.5, 1),
        'estimate__min_data_in_leaf': scope.int(hp.qloguniform('estimate__min_data_in_leaf', 0, 6, 1)),
        'estimate__min_sum_hessian_in_leaf': hp.loguniform('estimate__min_sum_hessian_in_leaf', -16, 5),
        # regularisation: either exactly 0 or a log-uniformly drawn positive value
        'estimate__lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('estimate__lambda_l1_positive', -16, 2)]),
        'estimate__lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('estimate__lambda_l2_positive', -16, 2)]),
    }

    if not hyperopt:
        # Single fit with the fixed parameters configured above
        lgbm.pipeline.fit(X=lgbm.X_train, y=lgbm.y_train)
        lgbm.model = lgbm.pipeline

        # Diagnostics for the fitted model
        lgbm.stats()
        lgbm.plot_predicted_vs_actual()
        lgbm.plot_residuals(r_type='raw', do_lowess=False)
        lgbm.plot_feature_importance()
    else:
        # Hyperopt search over lgbm.space
        lgbm.run(do_lowess=DO_LOWESS)

In [None]:
    # Cell-level switches:
    #   DO_LOWESS - forwarded to rf.run() when the hyperopt search is used
    #   hyperopt  - True: run the hyperopt search over rf.space
    #               False: fit once with the fixed parameters below
    DO_LOWESS = False
    hyperopt = False

    # Fixed RandomForestRegressor settings used when the search is skipped
    rf_fixed_params = {
        'max_features': 0.5907165396346349,
        'min_samples_leaf': 10,
        'n_estimators': 2208,
        'oob_score': True,
        'random_state': 0,
    }

    # RandomForestRegressor wrapped in the project's HyperoptModel
    # (cv / max_evals presumably set the CV folds and the hyperopt evaluation
    # budget - confirm in HyperoptModel)
    rf = HyperoptModel(train.copy(), test.copy(), 'rf', cv=3, max_evals=10)
    rf.raw_features = []
    rf.pipeline = Pipeline(steps=[
        # Concatenate the outputs of the four feature groups side by side
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', CustomDataFrameMapper(user_features_transformations)),
            ('time_features', CustomDataFrameMapper(time_features_transformations)),
            ('answer_features', CustomDataFrameMapper(answer_features_transformations)),
            ('question_features', CustomDataFrameMapper(question_features_transformations)),
        ])),
        ('estimate', RandomForestRegressor(**rf_fixed_params)),
    ])

    # Collect the raw column name behind every mapped feature. The first item
    # of a feature spec is either a column name or a sequence whose first
    # element is the column name.
    for step_name, mapper in rf.pipeline.named_steps['prepare_features'].transformer_list:
        rf.raw_features += [
            (spec[0] if isinstance(spec[0], str) else spec[0][0])
            for spec in mapper.features
        ]

    # Hyperopt search space, keyed by pipeline parameter name.
    rf.space = {
        # single-option choices: these stay constant during the search
        'estimate__random_state': hp.choice('estimate__random_state', [0]),
        'estimate__oob_score': hp.choice('estimate__oob_score', [True]),

        # parameters that are actually searched
        'estimate__max_features': hp.uniform('estimate__max_features', 0, 1.0),
        'estimate__n_estimators': hp.choice('estimate__n_estimators', range(1, 3001)),
        #'estimate__criterion': hp.choice('estimate__criterion', ['gini', 'entropy']),
        'estimate__min_samples_leaf': hp.choice('estimate__min_samples_leaf', range(1, 101)),
        #'estimate__scale': hp.choice('estimate__scale', [0, 1.]),
        #'estimate__normalize': hp.choice('estimate__normalize', [0, 1.]),
    }

    if not hyperopt:
        # Single fit with the fixed parameters configured above
        rf.pipeline.fit(X=rf.X_train, y=rf.y_train)
        rf.model = rf.pipeline

        # Diagnostics for the fitted model
        rf.stats()
        rf.plot_predicted_vs_actual()
        rf.plot_residuals(r_type='raw', do_lowess=False)
        rf.plot_feature_importance()
    else:
        # Hyperopt search over rf.space
        rf.run(do_lowess=DO_LOWESS)

In [None]:
    # Cell-level switches:
    #   DO_LOWESS - forwarded to svm.run() when the hyperopt search is used
    #   hyperopt  - True: run the hyperopt search over svm.space
    #               False: fit once with the fixed parameters below
    DO_LOWESS = False
    hyperopt = False

    # Switch to the SVR-specific feature transformations.
    # NOTE: this rebinds the same four notebook-level names that were first
    # assigned from _tree_features_transformations(), so every cell executed
    # after this one sees the SVR variants.
    (
        answer_features_transformations,
        question_features_transformations,
        time_features_transformations,
        user_features_transformations,
    ) = _svr_features_transformations()

    # SVR wrapped in the project's HyperoptModel
    # (cv=3 is presumably the number of cross-validation folds - confirm in HyperoptModel)
    svm = HyperoptModel(train.copy(), test.copy(), 'svr', cv=3)
    svm.pipeline = Pipeline(steps=[
        # Concatenate the outputs of the four feature groups side by side
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', CustomDataFrameMapper(user_features_transformations)),
            ('time_features', CustomDataFrameMapper(time_features_transformations)),
            ('answer_features', CustomDataFrameMapper(answer_features_transformations)),
            ('question_features', CustomDataFrameMapper(question_features_transformations)),
        ])),
        ('estimate', SVR(kernel='rbf', C=3.376124349816575, gamma=0.0069678844996990535)),
    ])

    # Collect the raw column name behind every mapped feature. The first item
    # of a feature spec is either a column name or a sequence whose first
    # element is the column name.
    svm.raw_features = []
    for step_name, mapper in svm.pipeline.named_steps['prepare_features'].transformer_list:
        svm.raw_features += [
            (spec[0] if isinstance(spec[0], str) else spec[0][0])
            for spec in mapper.features
        ]

    # Background on the two main SVM hyper-parameters:
    # 1) C trades off errors on the training examples against simplicity of
    #    the decision surface. A low C makes the surface smooth, while a high C
    #    aims at fitting all training examples correctly by giving the model
    #    freedom to select more samples as support vectors.
    # 2) gamma defines how far the influence of a single training example
    #    reaches, with low values meaning 'far' and high values meaning 'close'.
    #    It can be seen as the inverse of the radius of influence of the
    #    samples selected by the model as support vectors.

    # default_gamma = 1. / len(svm.raw_features)

    # Hyperopt search space, keyed by pipeline parameter name.
    svm.space = {
        'estimate__C': hp.uniform('estimate__C', 0, 10.0),
        'estimate__kernel': hp.choice('estimate__kernel', ['linear', 'sigmoid', 'rbf']),
        'estimate__gamma': hp.uniform('estimate__gamma', 0, 10.0),
    }

    if not hyperopt:
        # Single fit with the fixed parameters configured above
        svm.pipeline.fit(X=svm.X_train, y=svm.y_train)
        svm.model = svm.pipeline

        # Diagnostics for the fitted model
        svm.stats()
        svm.plot_predicted_vs_actual()
        svm.plot_residuals(r_type='raw', do_lowess=False)
        svm.plot_feature_importance()
    else:
        # Hyperopt search over svm.space
        svm.run(do_lowess=DO_LOWESS)

In [None]:
    # Cell-level switches:
    #   DO_LOWESS - forwarded to nn.run() and to the predicted-vs-actual plot
    #   hyperopt  - True: run the hyperopt search over nn.space
    #               False: fit once with the estimator configured below
    DO_LOWESS = False
    hyperopt = False

    # Bind the feature transformations explicitly instead of relying on
    # whichever cell last assigned these notebook-level names. When the
    # notebook is run top to bottom the SVR cell is the last one to assign
    # them (from _svr_features_transformations()), so calling the same helper
    # here keeps that behaviour while making the cell safe to run in any order.
    (
        answer_features_transformations,
        question_features_transformations,
        time_features_transformations,
        user_features_transformations,
    ) = _svr_features_transformations()

    # MLPRegressor wrapped in the project's HyperoptModel
    # (cv / max_evals presumably set the CV folds and the hyperopt evaluation
    # budget - confirm in HyperoptModel)
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=100)
    nn.raw_features = []

    nn.pipeline = Pipeline([
        # Concatenate the outputs of the four feature groups side by side
        ('prepare_features', FeatureUnion([
            ('user_features', CustomDataFrameMapper(user_features_transformations)),
            ('time_features', CustomDataFrameMapper(time_features_transformations)),
            ('answer_features', CustomDataFrameMapper(answer_features_transformations)),
            ('question_features', CustomDataFrameMapper(question_features_transformations))
        ])),
        # Fix the seed like every other estimator in this notebook (seed 0);
        # without it the weight initialisation and shuffling differ per run.
        ('estimate', MLPRegressor(random_state=0))
    ])

    # Collect the raw column name behind every mapped feature. The first item
    # of a feature spec is either a column name or a sequence whose first
    # element is the column name.
    for transformer in nn.pipeline.named_steps['prepare_features'].transformer_list:
        nn.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]

    # Hyperopt search space, keyed by pipeline parameter name.
    nn.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0.00001, 1),
        'estimate__activation': hp.choice('estimate__activation', ['logistic']),  # 'identity', 'logistic', 'tanh', 'relu'
        #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
        # two hidden layers, each with a width drawn uniformly from [1, 100) and cast to int
        'estimate__hidden_layer_sizes': (scope.int(hp.uniform('estimate__first_layer', 1, 100)),
                                         scope.int(hp.uniform('estimate__second_layer', 1, 100))),
        'estimate__solver': hp.choice('estimate__solver', ['adam']),  # 'lbfgs', 'sgd',
        #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        # Hyperopt search over nn.space
        nn.run(do_lowess=DO_LOWESS)
    else:
        # Single fit with MLPRegressor's default hyper-parameters (plus the fixed seed)
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        nn.model = nn.pipeline

        # Diagnostics for the fitted model
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        nn.plot_feature_importance()
        nn.qq_plot()