In [6]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Resolve the directory two levels above the current working directory and
# put it on the import path (once), so the project-local import that follows
# can be found. Presumably this is where the `models` package lives.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

In [7]:
# Load the pickled, preprocessed feature table from disk.
# NOTE: unpickling executes whatever the file encodes; only load trusted files.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_quantitative.pd'
)

# Wrap the loaded object in a DataFrame and list the available columns
df = pd.DataFrame(data=preprocessed_data)
print(df.columns)

Index(['PAPER_COUNT', 'score', 'NUM_FIRST_POS', 'NUM_SECOND_POS',
       'NUM_THIRD_POS', 'NUM_HIGHER_POS', 'NUM_YEARS_SINCE_FIRST_PUBLICATION',
       'NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION',
       'AVG_NUM_PUBLICATIONS_PER_YEAR', 'NUM_INSTITUTIONS',
       'NUM_TOP500_INSTITUTIONS', 'SHANGHAI_RANK', 'NTU_RANK', 'THE_RANK',
       'SHANGHAI_SCORE', 'NTU_SCORE', 'THE_SCORE', 'AVG_TITLE_LENGTH',
       'AVG_ABSTRACT_LENGTH', 'COLLAB_DEGREE_UNWEIGHTED',
       'COLLAB_DEGREE_WEIGHTED', 'COLLAB_CLOSENESS_UNWEIGHTED',
       'COLLAB_CLOSENESS_WEIGHTED', 'COLLAB_BETWEENNESS_UNWEIGHTED',
       'COLLAB_BETWEENNESS_WEIGHTED', 'COLLAB_PAGERANK_UNWEIGHTED',
       'COLLAB_PAGERANK_WEIGHTED', 'COLLAB_EIGENVECTOR_UNWEIGHTED',
       'COLLAB_EIGENVECTOR_WEIGHTED', 'CIT_OUTDEGREE_UNWEIGHTED',
       'CIT_OUTDEGREE_WEIGHTED', 'CIT_CLOSENESS_UNWEIGHTED',
       'CIT_CLOSENESS_WEIGHTED', 'CIT_BETWEENNESS_UNWEIGHTED',
       'CIT_BETWEENNESS_WEIGHTED', 'TOP_SIM_CORPUS', 'TOP_SIM_UNIFORM',
     

In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the partition the same on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each partition
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- Switches for this cell ---
    DO_LOWESS = False  # forwarded as `do_lowess` to the run / plot calls below
    hyperopt = True    # True: run the hyperopt search; False: fit the fixed pipeline once

    # Build the project's HyperoptModel wrapper for a Lasso regression.
    # `cv` and `max_evals` are passed straight through; their exact semantics
    # are defined by HyperoptModel, which is not visible in this notebook.
    lasso = HyperoptModel(
        train.copy(),
        test.copy(),
        'lasso',
        cv=4,
        max_evals=500,
    )

    # Every column except the target 'score' is used as a raw feature
    features = df.columns.tolist()
    features.remove('score')

    # Estimator seeded with a fixed regularisation strength
    lasso_params = {'alpha': 0.0007044944640742616}
    lasso.pipeline = Pipeline(steps=[
        ('estimate', linear_model.Lasso(**lasso_params)),
    ])

    lasso.raw_features = features

    # Search space handed to hyperopt: alpha drawn uniformly from [0, 10].
    # NOTE(review): the seeded alpha above (~7e-4) sits at the extreme low end
    # of this range; a log-scaled prior may explore that region better.
    # Confirm before changing, since it alters the experiment.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if not hyperopt:
        # No search: fit once with the alpha hard-coded in the pipeline above,
        # then produce the usual statistics and diagnostic plots.
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()
    else:
        # Hyperparameter optimisation over `lasso.space`
        lasso.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=1.00 sec	RMSE=0.171113	R^2=0.269274
[2/500]	cv_eval_time=1.40 sec	RMSE=0.170466	R^2=0.274985
[3/500]	cv_eval_time=1.58 sec	RMSE=0.170028	R^2=0.279169
[4/500]	cv_eval_time=1.84 sec	RMSE=0.169954	R^2=0.280070
[5/500]	cv_eval_time=2.76 sec	RMSE=0.163504	R^2=0.334033
[6/500]	cv_eval_time=1.62 sec	RMSE=0.170819	R^2=0.272319
[7/500]	cv_eval_time=1.90 sec	RMSE=0.172154	R^2=0.260568
[8/500]	cv_eval_time=1.70 sec	RMSE=0.173090	R^2=0.252906
[9/500]	cv_eval_time=1.47 sec	RMSE=0.171589	R^2=0.265472
[10/500]	cv_eval_time=1.53 sec	RMSE=0.170204	R^2=0.277128
[11/500]	cv_eval_time=1.45 sec	RMSE=0.170810	R^2=0.271907
[12/500]	cv_eval_time=1.48 sec	RMSE=0.172023	R^2=0.262116
[13/500]	cv_eval_time=1.73 sec	RMSE=0.171250	R^2=0.268169
[14/500]	cv_eval_time=2.41 sec	RMSE=0.170212	R^2=0.277433
[15/500]	cv_eval_time=2.35 sec	RMSE=0.170154	R^2=0.277811
[16/500]	cv_eval_time=2.21 sec	RMSE=0.170749	R^2=0.273581
[17/500]	cv_eval_time=2.24 sec	RMSE=0.17188

[141/500]	cv_eval_time=1.46 sec	RMSE=0.170751	R^2=0.272832
[142/500]	cv_eval_time=3.48 sec	RMSE=0.168716	R^2=0.291107
[143/500]	cv_eval_time=2.34 sec	RMSE=0.169756	R^2=0.279053
[144/500]	cv_eval_time=2.61 sec	RMSE=0.149063	R^2=0.445338
[145/500]	cv_eval_time=1.45 sec	RMSE=0.170552	R^2=0.274633
[146/500]	cv_eval_time=3.03 sec	RMSE=0.129320	R^2=0.582955
[147/500]	cv_eval_time=9.36 sec	RMSE=0.111161	R^2=0.691405
[148/500]	cv_eval_time=1.60 sec	RMSE=0.160077	R^2=0.361003
[149/500]	cv_eval_time=1.91 sec	RMSE=0.137596	R^2=0.527404
[150/500]	cv_eval_time=1.94 sec	RMSE=0.162348	R^2=0.343406
[151/500]	cv_eval_time=5.35 sec	RMSE=0.110912	R^2=0.693072
[152/500]	cv_eval_time=2.09 sec	RMSE=0.143696	R^2=0.485169
[153/500]	cv_eval_time=1.84 sec	RMSE=0.160882	R^2=0.354461
[154/500]	cv_eval_time=1.90 sec	RMSE=0.164117	R^2=0.327563
[155/500]	cv_eval_time=1.36 sec	RMSE=0.154725	R^2=0.402812
[156/500]	cv_eval_time=1.35 sec	RMSE=0.160569	R^2=0.356908
[157/500]	cv_eval_time=1.89 sec	RMSE=0.161871	R^2=0.3459

[280/500]	cv_eval_time=2.42 sec	RMSE=0.146178	R^2=0.467442
[281/500]	cv_eval_time=3.01 sec	RMSE=0.161803	R^2=0.347492
[282/500]	cv_eval_time=2.51 sec	RMSE=0.160471	R^2=0.357300
[283/500]	cv_eval_time=8.36 sec	RMSE=0.111459	R^2=0.690018
[284/500]	cv_eval_time=1.39 sec	RMSE=0.171000	R^2=0.270692
[285/500]	cv_eval_time=2.89 sec	RMSE=0.136227	R^2=0.536988
[286/500]	cv_eval_time=1.69 sec	RMSE=0.160056	R^2=0.361216
[287/500]	cv_eval_time=2.72 sec	RMSE=0.163146	R^2=0.335546
[288/500]	cv_eval_time=1.41 sec	RMSE=0.171464	R^2=0.267070
[289/500]	cv_eval_time=2.39 sec	RMSE=0.149346	R^2=0.443754
[290/500]	cv_eval_time=1.27 sec	RMSE=0.170744	R^2=0.272671
[291/500]	cv_eval_time=9.96 sec	RMSE=0.117829	R^2=0.653931
[292/500]	cv_eval_time=2.01 sec	RMSE=0.160904	R^2=0.353886
[293/500]	cv_eval_time=1.31 sec	RMSE=0.169837	R^2=0.279956
[294/500]	cv_eval_time=2.03 sec	RMSE=0.164630	R^2=0.323204
[295/500]	cv_eval_time=1.42 sec	RMSE=0.169768	R^2=0.280832
[296/500]	cv_eval_time=1.65 sec	RMSE=0.139174	R^2=0.5172

[419/500]	cv_eval_time=3.43 sec	RMSE=0.161953	R^2=0.345797
[420/500]	cv_eval_time=3.30 sec	RMSE=0.160553	R^2=0.357329
[421/500]	cv_eval_time=1.58 sec	RMSE=0.154392	R^2=0.405566
[422/500]	cv_eval_time=5.53 sec	RMSE=0.106942	R^2=0.714825
[423/500]	cv_eval_time=1.78 sec	RMSE=0.161350	R^2=0.350757
[424/500]	cv_eval_time=1.64 sec	RMSE=0.139279	R^2=0.516434
[425/500]	cv_eval_time=1.20 sec	RMSE=0.160322	R^2=0.358828
[426/500]	cv_eval_time=1.95 sec	RMSE=0.160822	R^2=0.354977
[427/500]	cv_eval_time=2.74 sec	RMSE=0.137069	R^2=0.531404
[428/500]	cv_eval_time=2.41 sec	RMSE=0.148403	R^2=0.450372
[429/500]	cv_eval_time=10.73 sec	RMSE=0.110036	R^2=0.698064
[430/500]	cv_eval_time=3.85 sec	RMSE=0.164305	R^2=0.325873
[431/500]	cv_eval_time=4.12 sec	RMSE=0.162847	R^2=0.337728
[432/500]	cv_eval_time=4.20 sec	RMSE=0.159767	R^2=0.363453
[433/500]	cv_eval_time=7.30 sec	RMSE=0.160403	R^2=0.357986
[434/500]	cv_eval_time=10.87 sec	RMSE=0.162232	R^2=0.343965
[435/500]	cv_eval_time=5.80 sec	RMSE=0.144689	R^2=0.47