In [1]:
#imports
import lightgbm as lgb
import numpy as np
import pandas as pd
import sqlalchemy
from catboost import CatBoostRegressor
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from catboost import CatBoostRegressor

import os
import sys
# Put the parent of the current working directory on the import path
# (presumably where the local `models` package imported below lives).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel
from models.utils import CustomDataFrameMapper
from models.utils import get_features

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code embedded in the file — only load pickles from a trusted source.
# NOTE(review): the file name is spelled 'preprocesed' (single "s"); presumably it matches the file on disk — confirm before renaming.
preprocessed_data = pd.read_pickle('./data/preprocesed_data.pd')

# Wrap the loaded object in a DataFrame; the train/test split below works on `df`.
df = pd.DataFrame(preprocessed_data)

In [3]:
# Split `df` into a training frame and a held-out test frame (20% of the rows).
# random_state=0 fixes the shuffle so the same split is produced on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each partition.
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 9303
Number of observations in the test data: 2326


In [5]:
    # --- Run switches -------------------------------------------------------
    # DO_LOWESS is forwarded as do_lowess= to cat.run() and cat.plot_predicted_vs_actual().
    DO_LOWESS = False
    # True: optimise over cat.space via cat.run(); False: fit the pipeline once as configured below.
    hyperopt = True

    # --- Model wrapper ------------------------------------------------------
    # CatBoostRegressor pipeline wrapped in the project's HyperoptModel helper.
    cat = HyperoptModel(train.copy(), test.copy(), 'cat', cv=3)
    cat.raw_features = []
    comment_features, mention_features, post_features, user_features = get_features()

    # Estimator settings used when the pipeline is constructed. They apply as-is
    # when `hyperopt` is False.
    # NOTE(review): presumably cat.run() replaces them with values drawn from
    # cat.space — confirm in HyperoptModel.
    catboost_params = {
        'bagging_temperature': 0.982256905186019,
        'iterations': 1300,
        'l2_leaf_reg': 2.2883628406963057,
        'learning_rate': 0.03533373267651768,
        'loss_function': 'RMSE',
        'random_seed': 0,
        'random_strength': 1,
        'thread_count': 4,
        'train_dir': 'outputs/cat',
    }

    # One feature mapper per feature group, concatenated side by side.
    feature_union = FeatureUnion([
        ('user_stats', user_features),
        ('mention_stats', mention_features),
        ('posts_stats', post_features),
        ('comments_stats', comment_features),
    ])

    cat.pipeline = Pipeline([
        ('prepare_features', feature_union),
        ('estimate', CatBoostRegressor(**catboost_params)),
    ])

    # Gather the raw input column names referenced by every mapper in the union.
    # NOTE(review): assumes each entry of `.features` keeps its column name at
    # [0][0] — confirm against CustomDataFrameMapper.
    for step_name, mapper in cat.pipeline.named_steps['prepare_features'].transformer_list:
        cat.raw_features += [feature[0][0] for feature in mapper.features]

    print(cat.raw_features)

    # --- Hyperopt search space ----------------------------------------------
    # Settings that are not tuned: each is a single-option hp.choice, so every
    # trial receives the same value.
    pinned_space = {
        'estimate__iterations': hp.choice('estimate__iterations', [1300]),
        'estimate__loss_function': hp.choice('estimate__loss_function', ['RMSE']),
        'estimate__train_dir': hp.choice('estimate__train_dir', ['outputs/cat']),
        'estimate__thread_count': hp.choice('estimate__thread_count', [4]),
        'estimate__used_ram_limit': hp.choice('estimate__used_ram_limit', [4 * 1024 ** 3]),  # 4gb
        'estimate__random_seed': hp.choice('estimate__random_seed', [0]),
    }
    # Settings that are actually searched.
    tuned_space = {
        # log-uniform between exp(-5) and exp(0) = 1
        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -5, 0),
        # either 1 or 20
        'estimate__random_strength': hp.choice('estimate__random_strength', [1, 20]),
        # log-uniform between 1 and 10
        'estimate__l2_leaf_reg': hp.loguniform('estimate__l2_leaf_reg', 0, np.log(10)),
        # uniform between 0 and 1
        'estimate__bagging_temperature': hp.uniform('estimate__bagging_temperature', 0, 1),
    }
    search_space = {}
    search_space.update(pinned_space)
    search_space.update(tuned_space)
    cat.space = search_space

    # How to find the best number of trees (following
    # https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning_trees-number-docpage/#parameter-tuning_trees-number).
    # The CatBoostRegressor step also has to be substituted with:
    #     ('estimate', CatBoostRegressor(iterations=10000, loss_function='RMSE', auto_stop_pval=1e-4, use_best_model=True, train_dir='outputs/cat_trees', verbose=True))
    #
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # eval_X = cat.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # cat.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set)
    # cat.model = cat.pipeline
    # print(cat.model.named_steps['estimate'].get_params())
    # cat.model.named_steps['estimate'].save_model('tmp/cat.model'.encode('utf-8'))

    # --- Train --------------------------------------------------------------
    if not hyperopt:
        # Single fit using the estimator parameters configured above.
        cat.pipeline.fit(X=cat.X_train, y=cat.y_train)
    else:
        cat.run(do_lowess=DO_LOWESS)

    # Runs after either branch.
    # NOTE(review): if cat.run() stores its tuned model on cat.model, this
    # assignment replaces it with cat.pipeline — confirm that is intended.
    cat.model = cat.pipeline

    # --- Report -------------------------------------------------------------
    cat.stats()
    cat.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
    cat.plot_feature_importance()

['user_friends_num', 'highest_education_level', 'education_is_present', 'languages_num', 'language_info_is_present', 'unique_mention_authors_per_friend', 'mentions_per_friend', 'user_friends_per_post', 'user_media_to_all_normal_ratio', 'user_normal_posts_num', 'user_life_events_num', 'user_small_posts_num', 'avg_normal_post_length', 'user_comments_num', 'avg_comment_length', 'user_likes_per_comment', 'comments_on_own_posts_num', 'comments_on_own_life_events_num']
Performing parameters optimization...
[1/1]	cv_eval_time=138.58 sec	RMSE=0.078741	R^2=0.694653
 elapsed time: 2min 18s

Stats (train | test):
	R^2 score:		0.7213
					0.7191
	RMSE:			0.0757
					0.0761
	Mean error:		0.0581
					0.0587

Best parameters set:
{'estimate__bagging_temperature': 0.87860092695984,
 'estimate__iterations': 1300,
 'estimate__l2_leaf_reg': 1.8859278353041455,
 'estimate__learning_rate': 0.007875631617459423,
 'estimate__loss_function': 'RMSE',
 'estimate__random_seed': 0,
 'estimate__random_strength': 2