In [7]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

In [8]:
# read the data
preprocessed_data = pd.read_pickle('../data/preprocessed_science_hindex.pd')

# Create a dataframe
df = pd.DataFrame(preprocessed_data)
print(df.columns)

Index(['NUM_FIRST_POS', 'NUM_SECOND_POS', 'NUM_YEARS_SINCE_FIRST_PUBLICATION',
       'NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION', 'NUM_INSTITUTIONS',
       'NUM_TOP500_INSTITUTIONS', 'SHANGHAI_RANK', 'NTU_RANK', 'THE_RANK',
       'SHANGHAI_SCORE', 'NTU_SCORE', 'THE_SCORE', 'AVG_TITLE_LENGTH',
       'AVG_ABSTRACT_LENGTH', 'COLLAB_DEGREE_UNWEIGHTED',
       'COLLAB_DEGREE_WEIGHTED', 'COLLAB_CLOSENESS_UNWEIGHTED',
       'COLLAB_CLOSENESS_WEIGHTED', 'COLLAB_BETWEENNESS_UNWEIGHTED',
       'COLLAB_BETWEENNESS_WEIGHTED', 'COLLAB_PAGERANK_UNWEIGHTED',
       'COLLAB_PAGERANK_WEIGHTED', 'COLLAB_EIGENVECTOR_UNWEIGHTED',
       'COLLAB_EIGENVECTOR_WEIGHTED', 'CIT_OUTDEGREE_UNWEIGHTED',
       'CIT_OUTDEGREE_WEIGHTED', 'CIT_CLOSENESS_UNWEIGHTED',
       'CIT_CLOSENESS_WEIGHTED', 'CIT_BETWEENNESS_UNWEIGHTED',
       'CIT_BETWEENNESS_WEIGHTED', 'score', 'TOP_SIM_CORPUS',
       'TOP_SIM_UNIFORM', 'TOP_SIM_PAPERS', 'NUM_TOPICS_GREATER_CORPUS',
       'NUM_TOPICS_GREATER_UNIFORM'],
      dtyp

In [3]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    #some parameters
    DO_LOWESS = False
    hyperopt = True
    # Run Lasso with hyperopt optimization
    lasso = HyperoptModel(train.copy(), test.copy(),'lasso', cv=4, max_evals = 500)
    features = list(df)
    features.remove('score')

    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(**{'alpha': 0.0006577184991258585}))
    ])
    
    lasso.raw_features = features
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=1.54 sec	RMSE=0.146802	R^2=0.325093
[2/500]	cv_eval_time=0.83 sec	RMSE=0.148608	R^2=0.308868
[3/500]	cv_eval_time=0.82 sec	RMSE=0.148285	R^2=0.311348
[4/500]	cv_eval_time=0.82 sec	RMSE=0.148324	R^2=0.311033
[5/500]	cv_eval_time=0.83 sec	RMSE=0.151504	R^2=0.281101
[6/500]	cv_eval_time=0.79 sec	RMSE=0.148890	R^2=0.305730
[7/500]	cv_eval_time=0.76 sec	RMSE=0.149660	R^2=0.298199
[8/500]	cv_eval_time=0.87 sec	RMSE=0.148126	R^2=0.312458
[9/500]	cv_eval_time=1.07 sec	RMSE=0.147665	R^2=0.316416
[10/500]	cv_eval_time=0.82 sec	RMSE=0.148182	R^2=0.312660
[11/500]	cv_eval_time=0.77 sec	RMSE=0.150365	R^2=0.292052
[12/500]	cv_eval_time=0.87 sec	RMSE=0.147585	R^2=0.318149
[13/500]	cv_eval_time=0.80 sec	RMSE=0.148413	R^2=0.310039
[14/500]	cv_eval_time=0.84 sec	RMSE=0.147800	R^2=0.315781
[15/500]	cv_eval_time=1.26 sec	RMSE=0.140534	R^2=0.381947
[16/500]	cv_eval_time=1.06 sec	RMSE=0.148596	R^2=0.308553
[17/500]	cv_eval_time=0.85 sec	RMSE=0.14858

[142/500]	cv_eval_time=2.17 sec	RMSE=0.146319	R^2=0.329709
[143/500]	cv_eval_time=1.87 sec	RMSE=0.141276	R^2=0.376071
[144/500]	cv_eval_time=1.05 sec	RMSE=0.147481	R^2=0.318700
[145/500]	cv_eval_time=1.35 sec	RMSE=0.139615	R^2=0.389192
[146/500]	cv_eval_time=2.01 sec	RMSE=0.109698	R^2=0.622958
[147/500]	cv_eval_time=1.09 sec	RMSE=0.129831	R^2=0.472014
[148/500]	cv_eval_time=1.31 sec	RMSE=0.139973	R^2=0.385185
[149/500]	cv_eval_time=2.04 sec	RMSE=0.108759	R^2=0.629590
[150/500]	cv_eval_time=1.29 sec	RMSE=0.140616	R^2=0.380814
[151/500]	cv_eval_time=1.81 sec	RMSE=0.142306	R^2=0.364969
[152/500]	cv_eval_time=1.13 sec	RMSE=0.139000	R^2=0.394449
[153/500]	cv_eval_time=1.87 sec	RMSE=0.143393	R^2=0.355984
[154/500]	cv_eval_time=1.91 sec	RMSE=0.123931	R^2=0.519550
[155/500]	cv_eval_time=1.59 sec	RMSE=0.139512	R^2=0.390297
[156/500]	cv_eval_time=1.01 sec	RMSE=0.138995	R^2=0.394413
[157/500]	cv_eval_time=1.81 sec	RMSE=0.144257	R^2=0.347769
[158/500]	cv_eval_time=1.46 sec	RMSE=0.141422	R^2=0.3731

[281/500]	cv_eval_time=1.11 sec	RMSE=0.137623	R^2=0.406638
[282/500]	cv_eval_time=1.54 sec	RMSE=0.119644	R^2=0.551458
[283/500]	cv_eval_time=1.87 sec	RMSE=0.146115	R^2=0.330067
[284/500]	cv_eval_time=1.75 sec	RMSE=0.141163	R^2=0.376618
[285/500]	cv_eval_time=1.46 sec	RMSE=0.128594	R^2=0.481922
[286/500]	cv_eval_time=1.74 sec	RMSE=0.143497	R^2=0.355973
[287/500]	cv_eval_time=2.30 sec	RMSE=0.105713	R^2=0.649834
[288/500]	cv_eval_time=1.37 sec	RMSE=0.139737	R^2=0.388471
[289/500]	cv_eval_time=1.06 sec	RMSE=0.147524	R^2=0.317333
[290/500]	cv_eval_time=0.91 sec	RMSE=0.148531	R^2=0.309331
[291/500]	cv_eval_time=1.91 sec	RMSE=0.144894	R^2=0.342006
[292/500]	cv_eval_time=1.13 sec	RMSE=0.139159	R^2=0.393736
[293/500]	cv_eval_time=1.56 sec	RMSE=0.140715	R^2=0.379832
[294/500]	cv_eval_time=1.00 sec	RMSE=0.137115	R^2=0.411219
[295/500]	cv_eval_time=1.58 sec	RMSE=0.127383	R^2=0.491996
[296/500]	cv_eval_time=1.66 sec	RMSE=0.142341	R^2=0.365237
[297/500]	cv_eval_time=2.96 sec	RMSE=0.104539	R^2=0.6576

[420/500]	cv_eval_time=1.16 sec	RMSE=0.119445	R^2=0.553269
[421/500]	cv_eval_time=2.13 sec	RMSE=0.107167	R^2=0.640285
[422/500]	cv_eval_time=1.68 sec	RMSE=0.142900	R^2=0.360025
[423/500]	cv_eval_time=1.36 sec	RMSE=0.139341	R^2=0.392448
[424/500]	cv_eval_time=2.79 sec	RMSE=0.103729	R^2=0.662995
[425/500]	cv_eval_time=1.25 sec	RMSE=0.129956	R^2=0.470803
[426/500]	cv_eval_time=1.61 sec	RMSE=0.140151	R^2=0.384530
[427/500]	cv_eval_time=1.00 sec	RMSE=0.134243	R^2=0.435295
[428/500]	cv_eval_time=0.89 sec	RMSE=0.148704	R^2=0.308225
[429/500]	cv_eval_time=1.31 sec	RMSE=0.142206	R^2=0.365443
[430/500]	cv_eval_time=0.84 sec	RMSE=0.138655	R^2=0.397814
[431/500]	cv_eval_time=1.30 sec	RMSE=0.141214	R^2=0.375167
[432/500]	cv_eval_time=0.78 sec	RMSE=0.148804	R^2=0.306189
[433/500]	cv_eval_time=1.31 sec	RMSE=0.139515	R^2=0.389906
[434/500]	cv_eval_time=2.71 sec	RMSE=0.107154	R^2=0.640185
[435/500]	cv_eval_time=1.55 sec	RMSE=0.121599	R^2=0.536493
[436/500]	cv_eval_time=1.72 sec	RMSE=0.143701	R^2=0.3527