In [1]:
#imports
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from math import sqrt
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from catboost import CatBoostRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor

import os
import sys
# Put the directory two levels above the working directory on the import path
# (presumably where the `models` package imported below lives - TODO confirm).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from models.utils import CustomDataFrameMapper
from models.utils import get_features_twitter_qualitative

from models.hyperopt_model import HyperoptModel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr,  spearmanr,  kendalltau

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed data set (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_qualitative.pd')

# Wrap the loaded object in a DataFrame and list the columns it provides.
df = pd.DataFrame(data=preprocessed_data)
print(df.columns)

In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split.
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
# Fetch the four feature mappers; each is later plugged into a FeatureUnion and
# has its `.features` list inspected to recover the raw column names.
(reply_features,
 retweet_features,
 tweet_features,
 user_features) = get_features_twitter_qualitative()

In [None]:
    # Linear regression on the combined user / tweet / retweet / reply features.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the HyperoptModel run/plot calls
    hyperopt = True    # True: search `linear.space` via linear.run(); False: fit the pipeline as configured

    linear = HyperoptModel(train.copy(), test.copy(), 'linear', cv=4, max_evals=500)
    linear.raw_features = []
    linear.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features)
        ])),
        ('estimate', linear_model.LinearRegression())
    ])

    # Record the raw column name behind every feature mapping. The first element
    # of a mapping entry is either a plain string (used as is) or an indexable
    # whose first item is taken.
    for transformer in linear.pipeline.named_steps['prepare_features'].transformer_list:
        linear.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]

    # Search space. The options must be real booleans: the strings 'True' and
    # 'False' are both truthy, so with string options the intercept could never
    # be switched off (and recent scikit-learn releases reject non-bool values).
    linear.space = {
        'estimate__fit_intercept': hp.choice('estimate__fit_intercept', [True, False]),
    }

    if hyperopt:
        linear.run(do_lowess=DO_LOWESS)
    else:
        # No search: fit the pipeline exactly as configured above, then report diagnostics.
        linear.pipeline.fit(X=linear.X_train, y=linear.y_train)
        linear.model = linear.pipeline
        linear.stats()
        linear.plot_feature_importance()
        linear.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        linear.qq_plot()

In [None]:
    # Lasso regression on the combined feature groups.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = False   # False: skip the search and fit with the alpha hard-coded below

    lasso = HyperoptModel(train.copy(), test.copy(), 'lasso', cv=4, max_evals=500)
    lasso.raw_features = []
    lasso.pipeline = Pipeline(steps=[
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features),
        ])),
        # alpha is fixed here (presumably the outcome of an earlier search - TODO confirm)
        ('estimate', linear_model.Lasso(alpha=0.0006577184991258585)),
    ])

    # Gather the raw column name of every mapping entry: a string entry is used
    # directly, otherwise the first item of the entry's first element is taken.
    lasso_union = lasso.pipeline.named_steps['prepare_features']
    for _step_name, mapper in lasso_union.transformer_list:
        lasso.raw_features += [
            spec[0][0] if not isinstance(spec[0], str) else spec[0]
            for spec in mapper.features
        ]

    # Search space used only when `hyperopt` is True.
    lasso.space = dict([
        ('estimate__alpha', hp.uniform('estimate__alpha', 0, 10)),
    ])

    if not hyperopt:
        # Fit the pipeline as configured above and report diagnostics.
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()
    else:
        lasso.run(do_lowess=DO_LOWESS)

In [None]:
    # Ridge regression on the combined feature groups.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = True    # True: search `ridge.space`; False: fit with the alpha set below

    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)
    ridge.raw_features = []

    ridge.pipeline = Pipeline(steps=[
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features),
        ])),
        ('estimate', linear_model.Ridge(alpha=0.05231780585024858)),
    ])

    # Gather the raw column name of every mapping entry: a string entry is used
    # directly, otherwise the first item of the entry's first element is taken.
    ridge_union = ridge.pipeline.named_steps['prepare_features']
    for _step_name, mapper in ridge_union.transformer_list:
        ridge.raw_features += [
            spec[0][0] if not isinstance(spec[0], str) else spec[0]
            for spec in mapper.features
        ]

    # Regularisation strength is the only tuned parameter.
    ridge.space = dict([
        ('estimate__alpha', hp.uniform('estimate__alpha', 0, 1000)),
    ])

    if not hyperopt:
        # Fit the pipeline as configured above and report diagnostics.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        ridge.run(do_lowess=DO_LOWESS)

In [None]:
    # Multi-layer perceptron regressor on the combined feature groups.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = True    # True: search `nn.space`; False: fit MLPRegressor with its defaults

    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)
    nn.raw_features = []

    nn.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features)
        ])),
        ('estimate', MLPRegressor())
    ])

    # Record the raw column name behind every feature mapping. The first element
    # of a mapping entry is either a plain string (used as is) or an indexable
    # whose first item is taken.
    for transformer in nn.pipeline.named_steps['prepare_features'].transformer_list:
        nn.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]

    # Search space. Activation and solver are pinned to a single option each;
    # the alternatives are kept in the trailing comments for reference.
    nn.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0.001, 1),
        'estimate__activation': hp.choice('estimate__activation', ['logistic']),  # 'identity', 'logistic', 'tanh', 'relu'
        #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
        'estimate__hidden_layer_sizes': scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
        'estimate__solver': hp.choice('estimate__solver', ['adam']),  # 'lbfgs', 'sgd',
        #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # No search: fit the pipeline as configured above, then report diagnostics.
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Must be this cell's own pipeline: `cat` is only created in a later cell,
        # so pointing at it fails on a fresh kernel or evaluates the wrong model.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        nn.plot_feature_importance()
        nn.qq_plot()

In [6]:
    # CatBoost regressor on the combined feature groups.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = False   # False: skip the search and fit CatBoostRegressor with its defaults

    cat = HyperoptModel(train.copy(), test.copy(), 'cat', cv=3, max_evals=30)
    cat.raw_features = []
    cat.pipeline = Pipeline(steps=[
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features),
        ])),
        ('estimate', CatBoostRegressor()),
    ])

    # Gather the raw column name of every mapping entry: a string entry is used
    # directly, otherwise the first item of the entry's first element is taken.
    cat_union = cat.pipeline.named_steps['prepare_features']
    for _step_name, mapper in cat_union.transformer_list:
        cat.raw_features += [
            spec[0][0] if not isinstance(spec[0], str) else spec[0]
            for spec in mapper.features
        ]

    # Parameters pinned to a single value (expressed as one-option choices).
    cat_fixed_params = {
        'estimate__iterations': hp.choice('estimate__iterations', [1300]),
        'estimate__loss_function': hp.choice('estimate__loss_function', ['RMSE']),
        'estimate__train_dir': hp.choice('estimate__train_dir', ['outputs/cat']),
        'estimate__thread_count': hp.choice('estimate__thread_count', [4]),
        'estimate__used_ram_limit': hp.choice('estimate__used_ram_limit', [4 * 1024 ** 3]),  # 4gb
        'estimate__random_seed': hp.choice('estimate__random_seed', [0]),
    }
    # Parameters that are actually searched.
    cat_tuned_params = {
        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -5, 0),
        'estimate__random_strength': hp.choice('estimate__random_strength', [1, 20]),
        'estimate__l2_leaf_reg': hp.loguniform('estimate__l2_leaf_reg', 0, np.log(10)),
        'estimate__bagging_temperature': hp.uniform('estimate__bagging_temperature', 0, 1),
    }
    cat.space = dict(cat_fixed_params, **cat_tuned_params)

    """
    find the best number of trees (following https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning_trees-number-docpage/#parameter-tuning_trees-number)
    need to also substitute the CatBoostRegressor parameters with:
        ('estimate', CatBoostRegressor(iterations=10000, loss_function='RMSE', auto_stop_pval=1e-4, use_best_model=True, train_dir='outputs/cat_trees', verbose=True))
    """
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # eval_X = cat.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # cat.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set)
    # cat.model = cat.pipeline
    # print(cat.model.named_steps['estimate'].get_params())
    # cat.model.named_steps['estimate'].save_model('tmp/cat.model'.encode('utf-8'))

    if not hyperopt:
        # Fit the pipeline as configured above and report diagnostics.
        cat.pipeline.fit(X=cat.X_train, y=cat.y_train)
        cat.model = cat.pipeline
        cat.stats()
        cat.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        cat.plot_feature_importance()
        #cat.qq_plot()
    else:
        cat.run(do_lowess=DO_LOWESS)

Stats (train | test):
	R^2 score:		0.5490
					0.3172
	RMSE:			0.0133
					0.0139
	Mean error:		0.0038
					0.0042
	Pearson:		0.7566
					0.5726
	Spearman:		0.6162
					0.5716
	KendallTau:		0.4665
					0.4339

Plotting predicted vs. actual ...done

[12.988011312458314, 22.699932324286472, 0.9508206146619843, 1.0363967959142244, 0.4416664732108111, 29.378013370058024, 8.56213259896625, 4.233722513273209, 3.8215911298931275, 6.959632954154819, 5.506524592517641, 1.380596137564508, 1.2420108904737448, 0.7989482925668469]
Plotting feature importances ...done



In [None]:
    # LightGBM regressor on the combined feature groups.
    # Flags for this cell:
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = True    # True: search `lgbm.space`; False: fit LGBMRegressor with its defaults

    lgbm = HyperoptModel(train.copy(), test.copy(), 'lgbm', cv=3, max_evals=50)
    lgbm.raw_features = []
    lgbm.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_features', user_features),
            ('tweet_features', tweet_features),
            ('retweet_features', retweet_features),
            ('reply_features', reply_features)
        ])),
        ('estimate', lgb.LGBMRegressor())
    ])

    # Record the raw column name behind every feature mapping. The first element
    # of a mapping entry is either a plain string (used as is) or an indexable
    # whose first item is taken.
    for transformer in lgbm.pipeline.named_steps['prepare_features'].transformer_list:
        lgbm.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
    """ find number of trees """
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # eval_X = lgbm.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # best = lgbm.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set, estimate__early_stopping_rounds=10)
    # print(best.named_steps['estimate'].best_iteration)

    # Search space: the first three entries are pinned to a single value, the
    # rest are sampled. Each L1/L2 penalty is either exactly 0 or drawn from a
    # log-uniform range, hence the nested choice.
    lgbm.space = {
        'estimate__objective': hp.choice('estimate__objective', ['regression']),
        'estimate__n_estimators': hp.choice('estimate__n_estimators', [400]),
        'estimate__seed': hp.choice('estimate__seed', [0]),

        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -7, 0),
        'estimate__num_leaves': scope.int(hp.qloguniform('estimate__num_leaves', 1, 7, 1)),
        'estimate__feature_fraction': hp.uniform('estimate__feature_fraction', 0.5, 1),
        'estimate__bagging_fraction': hp.uniform('estimate__bagging_fraction', 0.5, 1),
        'estimate__min_data_in_leaf': scope.int(hp.qloguniform('estimate__min_data_in_leaf', 0, 6, 1)),
        'estimate__min_sum_hessian_in_leaf': hp.loguniform('estimate__min_sum_hessian_in_leaf', -16, 5),
        'estimate__lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('estimate__lambda_l1_positive', -16, 2)]),
        'estimate__lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('estimate__lambda_l2_positive', -16, 2)]),
    }

    if hyperopt:
        lgbm.run(do_lowess=DO_LOWESS)
    else:
        # No search: fit the pipeline as configured above, then report diagnostics.
        lgbm.pipeline.fit(X=lgbm.X_train, y=lgbm.y_train)
        lgbm.model = lgbm.pipeline
        lgbm.stats()
        # Pass the cell's flag explicitly, as every other model cell does, so
        # DO_LOWESS is honoured on this path too.
        lgbm.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lgbm.plot_feature_importance()

In [None]:
    # Polynomial linear regression: for each degree 1..`degrees`, expand the
    # prepared features with PolynomialFeatures and fit a linear regression.
    #
    # The flags are set here so the cell does not depend on values leaked from
    # whichever model cell happened to run last. They match what a top-to-bottom
    # run would leave behind (DO_LOWESS=False, hyperopt=True).
    DO_LOWESS = False  # forwarded as `do_lowess` to the run/plot calls
    hyperopt = True    # True: run the (single-evaluation) search; False: fit the pipeline as configured
    degrees = 4

    for d in range(1, degrees + 1):
        print("Degree: %s" % d)

        # Build a fresh model and pipeline for this degree.
        polynomial_features = PolynomialFeatures(
            degree=d, include_bias=False
        )
        polynomial = HyperoptModel(train.copy(), test.copy(), 'poly' + str(d), cv=4, max_evals=1)
        polynomial.raw_features = []
        polynomial.pipeline = Pipeline([
            ('prepare_features', FeatureUnion([
                ('user_features', user_features),
                ('tweet_features', tweet_features),
                ('retweet_features', retweet_features),
                ('reply_features', reply_features)
            ])),
            ("polynomial_features", polynomial_features),
            ('estimate', linear_model.LinearRegression())
        ])

        # Record the raw column name behind every feature mapping. The first
        # element of a mapping entry is either a plain string (used as is) or an
        # indexable whose first item is taken.
        for transformer in polynomial.pipeline.named_steps['prepare_features'].transformer_list:
            polynomial.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]

        # The options must be real booleans: the strings 'True' and 'False' are
        # both truthy, so with string options the intercept could never be
        # switched off (and recent scikit-learn releases reject non-bool values).
        polynomial.space = {
            'estimate__fit_intercept': hp.choice('estimate__fit_intercept', [True, False]),
        }

        if hyperopt:
            polynomial.run(do_lowess=DO_LOWESS)
        else:
            # No search: fit the pipeline as configured above, then report diagnostics.
            polynomial.pipeline.fit(X=polynomial.X_train, y=polynomial.y_train)
            polynomial.model = polynomial.pipeline
            polynomial.stats()
            polynomial.plot_feature_importance()
            polynomial.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
            polynomial.qq_plot()