In [5]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel
from models.utils import CustomDataFrameMapper, get_features, get_svr_features
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr,  spearmanr,  kendalltau

In [6]:
# read the data
preprocessed_data = pd.read_pickle('./data/preprocesed_data.pd')

# Create a dataframe
df = pd.DataFrame(preprocessed_data)

In [3]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 9303
Number of observations in the test data: 2326


In [5]:
    # Run CatBoostRegressor with hyperopt optimization
    DO_LOWESS = False
    hyperopt = False
    # Run CatBoostRegressor with hyperopt optimization
    cat = HyperoptModel(train.copy(), test.copy(),'cat', cv=3)
    cat.raw_features = []
    comment_features, mention_features, post_features, user_features = get_features()

    cat.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            ('comments_stats', comment_features)
        ])),
        ('estimate', CatBoostRegressor(**{'bagging_temperature': 0.7581941022055045,
                                          'iterations': 1300,
                                          'l2_leaf_reg': 3.3369699206218777,
                                          'learning_rate': 0.03533373267651768,
                                          'loss_function': 'RMSE',
                                          'random_seed': 0,
                                          'random_strength': 1,
                                          'thread_count': 4,
                                          'train_dir': 'outputs/cat'}))
    ])
    
    
    for transformer in cat.pipeline.named_steps['prepare_features'].transformer_list:
        cat.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
   
    cat.space = {
        'estimate__iterations': hp.choice('estimate__iterations', [1300]),
        'estimate__loss_function': hp.choice('estimate__loss_function', ['RMSE']),
        'estimate__train_dir': hp.choice('estimate__train_dir', ['outputs/cat']),
        'estimate__thread_count': hp.choice('estimate__thread_count', [4]),
        'estimate__used_ram_limit': hp.choice('estimate__used_ram_limit', [1024 * 1024 * 1024 * 4]),  # 4gb
        'estimate__random_seed': hp.choice('estimate__random_seed', [0]),

        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -5, 0),
        'estimate__random_strength': hp.choice('estimate__random_strength', [1, 20]),
        'estimate__l2_leaf_reg': hp.loguniform('estimate__l2_leaf_reg', 0, np.log(10)),
        'estimate__bagging_temperature': hp.uniform('estimate__bagging_temperature', 0, 1),
    }

    """
    find the best number of trees (following https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning_trees-number-docpage/#parameter-tuning_trees-number)
    need to also substitute the CatBoostRegressor parameters with:
        ('estimate', CatBoostRegressor(iterations=10000, loss_function='RMSE', auto_stop_pval=1e-4, use_best_model=True, train_dir='outputs/cat_trees', verbose=True))
    """
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in cat.raw_features, cat.train.columns))]
    # eval_X = cat.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # cat.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set)
    # cat.model = cat.pipeline
    # print(cat.model.named_steps['estimate'].get_params())
    # cat.model.named_steps['estimate'].save_model('tmp/cat.model'.encode('utf-8'))

    if hyperopt:
        cat.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        cat.pipeline.fit(X=cat.X_train, y=cat.y_train)
        cat.model = cat.pipeline
        cat.stats()
        cat.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        cat.plot_feature_importance()
        cat.qq_plot()

Stats (train | test):
	R^2 score:		0.8565
					0.8021
	RMSE:			0.0543
					0.0639
	Mean error:		0.0413
					0.0489
	Pearson:		0.9260
					0.8956
	Spearman:		0.9140
					0.8829
	KendallTau:		0.7526
					0.7082

Plotting predicted vs. actual ...done

[14.627768243851087, 0.26611010965975174, 0.2280542307257879, 0.14958020618385887, 0.1810367164379663, 0.35266106966566646, 0.393718394698887, 1.3765718623294974, 23.610246490702558, 11.847929726658172, 2.2174420422462084, 2.138930239388131, 4.42185213172175, 2.5831550972605215, 1.9170008067008786, 6.3582630385605, 26.31305854367411, 1.0166210495346484]
Plotting feature importances ...done

Plotting QQ ...done



In [6]:
    # Run LGBMRegressor with hyperopt optimization
    DO_LOWESS = False
    hyperopt = False
    lgbm = HyperoptModel(train.copy(), test.copy(), 'lgbm', cv=5)
    comment_features, mention_features, post_features, user_features = get_features()
    lgbm.raw_features = []
    lgbm.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            ('comments_stats', comment_features)
        ])),
        ('estimate', lgb.LGBMRegressor(**{'bagging_fraction': 0.7394781981136033,
                                          'feature_fraction': 0.9385660204777418,
                                          'lambda_l1': 0.020835973532181865,
                                          'lambda_l2': 0.03941529177232025,
                                          'learning_rate': 0.035919305210976936,
                                          'min_data_in_leaf': 17,
                                          'min_sum_hessian_in_leaf': 1.3190919274009395e-05,
                                          'n_estimators': 400,
                                          'num_leaves': 33,
                                          'objective': 'regression',
                                          'seed': 0}))
    ])
    for transformer in lgbm.pipeline.named_steps['prepare_features'].transformer_list:
        lgbm.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
   
    """ find number of trees """
    # num_trees_train, num_trees_eval = train_test_split(train, test_size=0.2, random_state=0)
    # X = num_trees_train[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # y = num_trees_train['score']
    #
    # eval_X = num_trees_eval[list(filter(lambda column: column in lgbm.raw_features, lgbm.train.columns))]
    # eval_X = lgbm.pipeline.named_steps['prepare_features'].fit_transform(eval_X)
    # eval_set = (eval_X, num_trees_eval['score'])
    #
    # best = lgbm.pipeline.fit(X=X, y=y, estimate__eval_set=eval_set, estimate__early_stopping_rounds=10)
    # print(best.named_steps['estimate'].best_iteration)

    lgbm.space = {
        'estimate__objective': hp.choice('estimate__objective', ['regression']),
        'estimate__n_estimators': hp.choice('estimate__n_estimators', [400]),
        'estimate__seed': hp.choice('estimate__seed', [0]),

        'estimate__learning_rate': hp.loguniform('estimate__learning_rate', -7, 0),
        'estimate__num_leaves': scope.int(hp.qloguniform('estimate__num_leaves', 1, 7, 1)),
        'estimate__feature_fraction': hp.uniform('estimate__feature_fraction', 0.5, 1),
        'estimate__bagging_fraction': hp.uniform('estimate__bagging_fraction', 0.5, 1),
        'estimate__min_data_in_leaf': scope.int(hp.qloguniform('estimate__min_data_in_leaf', 0, 6, 1)),
        'estimate__min_sum_hessian_in_leaf': hp.loguniform('estimate__min_sum_hessian_in_leaf', -16, 5),
        'estimate__lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('estimate__lambda_l1_positive', -16, 2)]),
        'estimate__lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('estimate__lambda_l2_positive', -16, 2)]),
    }

    if hyperopt:
        lgbm.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        lgbm.pipeline.fit(X=lgbm.X_train, y=lgbm.y_train)
        lgbm.model = lgbm.pipeline
        lgbm.stats()
        lgbm.plot_predicted_vs_actual()
        lgbm.plot_feature_importance()

Stats (train | test):
	R^2 score:		0.8830
					0.7929
	RMSE:			0.0490
					0.0653
	Mean error:		0.0376
					0.0501
	Pearson:		0.9405
					0.8905
	Spearman:		0.9309
					0.8779
	KendallTau:		0.7778
					0.7014

Plotting predicted vs. actual ...done

Plotting feature importances ...done



In [7]:
    #parameters
    DO_LOWESS = False
    hyperopt = False
    # Run RandomForestRegressor with hyperopt optimization
    rf = HyperoptModel(train.copy(), test.copy(), 'rf', cv=3)
    rf.raw_features = []
    comment_features, mention_features, post_features, user_features = get_features()
    rf.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            ('comments_stats', comment_features)
        ])),
        ('estimate', RandomForestRegressor(**{'max_features': 0.5954111977396234,
                                              'min_samples_leaf': 1,
                                              'n_estimators': 388,
                                              'oob_score': True,
                                              'random_state': 0}))
    ])
    
    
    for transformer in rf.pipeline.named_steps['prepare_features'].transformer_list:
        rf.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
     # for transformer in rf.pipeline.named_steps['prepare_features'].transformer_list:
      #    rf.raw_features += [t if isinstance(t, basestring) else t[0] for t, _, _ in transformer[1].features]
    #print(rf.get_params().keys())
    rf.space = {
        'estimate__random_state': hp.choice('estimate__random_state', [0]),
        'estimate__oob_score': hp.choice('estimate__oob_score', [True]),

        'estimate__max_features': hp.uniform('estimate__max_features', 0, 1.),
        'estimate__n_estimators': hp.choice('estimate__n_estimators', range(1, 3000 + 1)),
        'estimate__min_samples_leaf': hp.choice('estimate__min_samples_leaf', range(1, 100 + 1)),
        #'estimate__scale': hp.choice('estimate__scale', [0, 1.]),
        #'estimate__normalize': hp.choice('estimate__normalize', [0, 1.]),
    }

    if hyperopt:
        rf.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        rf.pipeline.fit(X=rf.X_train, y=rf.y_train)
        rf.model = rf.pipeline
        rf.stats()
        rf.plot_predicted_vs_actual()
        rf.plot_feature_importance()

Stats (train | test):
	R^2 score:		0.9651
					0.7557
	RMSE:			0.0268
					0.0710
	Mean error:		0.0203
					0.0544
	Pearson:		0.9857
					0.8708
	Spearman:		0.9845
					0.8594
	KendallTau:		0.8960
					0.6770

Plotting predicted vs. actual ...done

Plotting feature importances ...done



In [8]:
    # Run SVM with hyperopt optimization
    DO_LOWESS = False
    hyperopt = False
    
    comment_features, mention_features, post_features, user_features = get_svr_features()
    svm = HyperoptModel(train.copy(), test.copy(), 'svr', cv=3)
    svm.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            # ('comments_stats', comment_features)
        ])),
        ('estimate', SVR(**{'C': 0.48823921084665933,
                            'epsilon': 0.06249804864583549,
                            'gamma': 0.1606868912867299,
                            'kernel': 'rbf'}))
    ])
    
    svm.raw_features = []
    for transformer in svm.pipeline.named_steps['prepare_features'].transformer_list:
        svm.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
   
    svm.space = {
        'estimate__C': hp.uniform('estimate__C', 0, 10.),
        'estimate__kernel': hp.choice('estimate__kernel', ['linear', 'sigmoid', 'rbf']),
        'estimate__gamma': hp.uniform('estimate__gamma', 0, 10.),
        #'estimate__scale': hp.choice('estimate__scale', [0, 1.]),
        #'estimate__normalize': hp.choice('estimate__normalize', [0, 1.]),
    }
    
    if hyperopt:
        svm.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        svm.pipeline.fit(X=svm.X_train, y=svm.y_train)
        svm.model = svm.pipeline
        svm.stats()
        svm.plot_predicted_vs_actual()
        svm.plot_feature_importance()



Stats (train | test):
	R^2 score:		0.6136
					0.5432
	RMSE:			0.0891
					0.0970
	Mean error:		0.0682
					0.0741
	Pearson:		0.7835
					0.7371
	Spearman:		0.7619
					0.7168
	KendallTau:		0.5804
					0.5316

Plotting predicted vs. actual ...done



In [9]:
    #some parameters
    DO_LOWESS = False
    hyperopt = False
    # Run MLPRegressor with hyperopt optimization
    comment_features, mention_features, post_features, user_features = get_features()
    nn = HyperoptModel(train.copy(), test.copy(),'nn', cv=3, max_evals=2)
    nn.raw_features = []

    nn.pipeline = Pipeline([
            ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            ('comments_stats', comment_features)
        ])),
        ('estimate', MLPRegressor(**{'alpha': 0.00189292968732814,
                                     'activation': 'logistic',
                                     'hidden_layer_sizes': (88,),
                                     'solver': 'adam'    }))
    ])
    
    for transformer in nn.pipeline.named_steps['prepare_features'].transformer_list:
        nn.raw_features += [t[0] if isinstance(t[0], str) else t[0][0] for t in transformer[1].features]
   

    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.00001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), # 'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         'estimate__hidden_layer_sizes' : (scope.int(hp.uniform('estimate__first_layer', 1, 100)),scope.int(hp.uniform('estimate__second_layer', 1, 100))) ,
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        nn.plot_feature_importance()
        nn.qq_plot()

Stats (train | test):
	R^2 score:		0.6667
					0.6471
	RMSE:			0.0828
					0.0853
	Mean error:		0.0653
					0.0670
	Pearson:		0.8169
					0.8054
	Spearman:		0.7894
					0.7766
	KendallTau:		0.6005
					0.5891

Plotting predicted vs. actual ...done

Plotting QQ ...done



In [4]:
    #Polynomial linear regression 
    degrees = 4
    for d in range(1, degrees+1):
            print("Degree: %s" % d)
            # Create the model, split the sets and fit it
            polynomial_features = PolynomialFeatures(
                degree=d, include_bias=False
            )
            linear_regression = linear_model.LinearRegression()
            model = Pipeline([
                ("polynomial_features", polynomial_features),
                ("linear_regression", linear_regression)
            ])
            dfPoly = df.copy()
            y = dfPoly['score'].copy()
            dfPoly.drop('score', axis=1, inplace=True) 

            # create training and testing vars
            X_train, X_test, y_train, y_test = train_test_split(dfPoly, y, test_size=0.2)

            model.fit(X_train, y_train)
            # Calculate the metrics
            train_prediction = model.predict(X_train)
            test_prediction = model.predict(X_test)
            
            print('Polynomial degree: {}'.format(d))
            print('Stats (train | test):')
            print('\tR^2 score:\t\t%.4f\n\t\t\t\t\t%.4f' % (r2_score(y_train, train_prediction),
                                                        r2_score(y_test, test_prediction)))
            print('\tRMSE:\t\t\t%.4f\n\t\t\t\t\t%.4f' % (mean_squared_error(y_train, train_prediction) ** 0.5,
                                                     mean_squared_error(y_test, test_prediction) ** 0.5))
            print('\tMean error:\t\t%.4f\n\t\t\t\t\t%.4f' % (mean_absolute_error(y_train, train_prediction),
                                                         mean_absolute_error(y_test, test_prediction)))
            print('\tPearson:\t\t%.4f\n\t\t\t\t\t%.4f' % (pearsonr(y_train, train_prediction)[0],
                                                         pearsonr(y_test, test_prediction)[0]))
            print('\tSpearman:\t\t%.4f\n\t\t\t\t\t%.4f' % (spearmanr(y_train, train_prediction)[0],
                                                         spearmanr(y_test, test_prediction)[0]))
            print('\tKendallTau:\t\t%.4f\n\t\t\t\t\t%.4f' % (kendalltau(y_train, train_prediction)[0],
                                                         kendalltau(y_test, test_prediction)[0]))
            

Degree: 1
Polynomial degree: 1
Stats (train | test):
	R^2 score:		0.5113
					0.5536
	RMSE:			0.1008
					0.0939
	Mean error:		0.0724
					0.0701
	Pearson:		0.7150
					0.7441
	Spearman:		0.8123
					0.8179
	KendallTau:		0.6269
					0.6307
Degree: 2
Polynomial degree: 2
Stats (train | test):
	R^2 score:		0.7627
					0.6012
	RMSE:			0.0696
					0.0920
	Mean error:		0.0484
					0.0532
	Pearson:		0.8734
					0.7863
	Spearman:		0.9141
					0.8977
	KendallTau:		0.7529
					0.7367
Degree: 3
Polynomial degree: 3
Stats (train | test):
	R^2 score:		0.8853
					-118.4447
	RMSE:			0.0488
					1.5300
	Mean error:		0.0331
					0.1629
	Pearson:		0.9410
					0.1134
	Spearman:		0.9583
					0.8091
	KendallTau:		0.8338
					0.6715
Degree: 4
Polynomial degree: 4
Stats (train | test):
	R^2 score:		0.9699
					-362283208.4452
	RMSE:			0.0249
					2717.0577
	Mean error:		0.0137
					104.7707
	Pearson:		0.9849
					0.0190
	Spearman:		0.9895
					0.3737
	KendallTau:		0.9247
					0.2924


Index(['user_friends_num', 'user_likes_per_post', 'user_friends_per_post',
       'user_media_to_all_normal_ratio', 'user_normal_posts_num',
       'user_life_events_num', 'user_small_posts_num',
       'avg_normal_post_length', 'user_comments_num', 'avg_comment_length',
       'user_likes_per_comment', 'comments_on_own_posts_num',
       'comments_on_own_life_events_num', 'highest_education_level',
       'education_is_present', 'languages_num', 'language_info_is_present',
       'unique_mention_authors', 'mentions_num', 'own_mentions_num',
       'unique_mention_authors_per_friend', 'mentions_per_friend',
       'own_mentions_per_friend'],
      dtype='object')
