In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# read the data
preprocessed_data = pd.read_pickle('../data/preprocessed_science_quantitative.pd')

# Create a dataframe
df = pd.DataFrame(preprocessed_data)
print(df.columns)

Index(['PAPER_COUNT', 'score', 'NUM_FIRST_POS', 'NUM_SECOND_POS',
       'NUM_THIRD_POS', 'NUM_HIGHER_POS', 'NUM_YEARS_SINCE_FIRST_PUBLICATION',
       'NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION',
       'AVG_NUM_PUBLICATIONS_PER_YEAR', 'NUM_INSTITUTIONS',
       'NUM_TOP500_INSTITUTIONS', 'SHANGHAI_RANK', 'NTU_RANK', 'THE_RANK',
       'SHANGHAI_SCORE', 'NTU_SCORE', 'THE_SCORE', 'AVG_TITLE_LENGTH',
       'AVG_ABSTRACT_LENGTH', 'COLLAB_DEGREE_UNWEIGHTED',
       'COLLAB_DEGREE_WEIGHTED', 'COLLAB_CLOSENESS_UNWEIGHTED',
       'COLLAB_CLOSENESS_WEIGHTED', 'COLLAB_BETWEENNESS_UNWEIGHTED',
       'COLLAB_BETWEENNESS_WEIGHTED', 'COLLAB_PAGERANK_UNWEIGHTED',
       'COLLAB_PAGERANK_WEIGHTED', 'COLLAB_EIGENVECTOR_UNWEIGHTED',
       'COLLAB_EIGENVECTOR_WEIGHTED', 'CIT_OUTDEGREE_UNWEIGHTED',
       'CIT_OUTDEGREE_WEIGHTED', 'CIT_CLOSENESS_UNWEIGHTED',
       'CIT_CLOSENESS_WEIGHTED', 'CIT_BETWEENNESS_UNWEIGHTED',
       'CIT_BETWEENNESS_WEIGHTED', 'TOP_SIM_CORPUS', 'TOP_SIM_UNIFORM',
     

In [3]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    #some parameters
    DO_LOWESS = False
    hyperopt = True
    # Run Lasso with hyperopt optimization
    lasso = HyperoptModel(train.copy(), test.copy(),'lasso', cv=4, max_evals = 500)
    features = list(df)
    features.remove('score')

    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(**{'alpha': 0.0007044944640742616}))
    ])
    
    lasso.raw_features = features
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=3.17 sec	RMSE=0.200235	R^2=-0.000025
[2/500]	cv_eval_time=2.32 sec	RMSE=0.200235	R^2=-0.000037
[3/500]	cv_eval_time=2.21 sec	RMSE=0.200243	R^2=-0.000005
[4/500]	cv_eval_time=2.32 sec	RMSE=0.200236	R^2=-0.000048
[5/500]	cv_eval_time=2.12 sec	RMSE=0.200239	R^2=-0.000008
[6/500]	cv_eval_time=2.18 sec	RMSE=0.200237	R^2=-0.000010
[7/500]	cv_eval_time=2.31 sec	RMSE=0.200236	R^2=-0.000037
[8/500]	cv_eval_time=2.28 sec	RMSE=0.200235	R^2=-0.000097
[9/500]	cv_eval_time=2.33 sec	RMSE=0.200238	R^2=-0.000023
[10/500]	cv_eval_time=2.40 sec	RMSE=0.200238	R^2=-0.000046
[11/500]	cv_eval_time=2.22 sec	RMSE=0.200239	R^2=-0.000091
[12/500]	cv_eval_time=2.18 sec	RMSE=0.200237	R^2=-0.000150
[13/500]	cv_eval_time=2.25 sec	RMSE=0.200236	R^2=-0.000199
[14/500]	cv_eval_time=2.20 sec	RMSE=0.200236	R^2=-0.000054
[15/500]	cv_eval_time=2.25 sec	RMSE=0.200237	R^2=-0.000027
[16/500]	cv_eval_time=2.06 sec	RMSE=0.200239	R^2=-0.000054
[17/500]	cv_eval_time=2.20 

[139/500]	cv_eval_time=1.77 sec	RMSE=0.200235	R^2=-0.000017
[140/500]	cv_eval_time=1.84 sec	RMSE=0.200234	R^2=-0.000053
[141/500]	cv_eval_time=1.82 sec	RMSE=0.200237	R^2=-0.000059
[142/500]	cv_eval_time=1.93 sec	RMSE=0.200249	R^2=-0.000061
[143/500]	cv_eval_time=1.80 sec	RMSE=0.200241	R^2=-0.000178
[144/500]	cv_eval_time=1.79 sec	RMSE=0.200238	R^2=-0.000075
[145/500]	cv_eval_time=1.77 sec	RMSE=0.200234	R^2=-0.000038
[146/500]	cv_eval_time=1.76 sec	RMSE=0.200235	R^2=-0.000036
[147/500]	cv_eval_time=1.79 sec	RMSE=0.200236	R^2=-0.000051
[148/500]	cv_eval_time=1.92 sec	RMSE=0.200235	R^2=-0.000001
[149/500]	cv_eval_time=1.81 sec	RMSE=0.200240	R^2=-0.000057
[150/500]	cv_eval_time=1.85 sec	RMSE=0.200235	R^2=-0.000049
[151/500]	cv_eval_time=1.81 sec	RMSE=0.200237	R^2=-0.000042
[152/500]	cv_eval_time=1.82 sec	RMSE=0.200237	R^2=-0.000117
[153/500]	cv_eval_time=1.81 sec	RMSE=0.200236	R^2=-0.000065
[154/500]	cv_eval_time=1.86 sec	RMSE=0.200238	R^2=-0.000108
[155/500]	cv_eval_time=1.76 sec	RMSE=0.2

[276/500]	cv_eval_time=1.28 sec	RMSE=0.200241	R^2=-0.000088
[277/500]	cv_eval_time=1.35 sec	RMSE=0.200236	R^2=-0.000062
[278/500]	cv_eval_time=1.32 sec	RMSE=0.200240	R^2=-0.000038
[279/500]	cv_eval_time=1.32 sec	RMSE=0.200235	R^2=-0.000050
[280/500]	cv_eval_time=1.31 sec	RMSE=0.200236	R^2=-0.000058
[281/500]	cv_eval_time=17.40 sec	RMSE=0.101632	R^2=0.742366
[282/500]	cv_eval_time=1.26 sec	RMSE=0.200239	R^2=-0.000008
[283/500]	cv_eval_time=2.37 sec	RMSE=0.139365	R^2=0.515531
[284/500]	cv_eval_time=1.24 sec	RMSE=0.200235	R^2=-0.000098
[285/500]	cv_eval_time=1.23 sec	RMSE=0.200237	R^2=-0.000010
[286/500]	cv_eval_time=1.23 sec	RMSE=0.200238	R^2=-0.000036
[287/500]	cv_eval_time=12.97 sec	RMSE=0.094619	R^2=0.776725
[288/500]	cv_eval_time=1.31 sec	RMSE=0.200236	R^2=-0.000111
[289/500]	cv_eval_time=1.32 sec	RMSE=0.200236	R^2=-0.000034
[290/500]	cv_eval_time=1.31 sec	RMSE=0.200240	R^2=-0.000100
[291/500]	cv_eval_time=1.31 sec	RMSE=0.200236	R^2=-0.000036
[292/500]	cv_eval_time=1.30 sec	RMSE=0.20

[413/500]	cv_eval_time=1.28 sec	RMSE=0.200235	R^2=-0.000016
[414/500]	cv_eval_time=1.30 sec	RMSE=0.200239	R^2=-0.000005
[415/500]	cv_eval_time=24.11 sec	RMSE=0.102591	R^2=0.737442
[416/500]	cv_eval_time=1.32 sec	RMSE=0.200237	R^2=-0.000092
[417/500]	cv_eval_time=1.30 sec	RMSE=0.200235	R^2=-0.000015
[418/500]	cv_eval_time=1.84 sec	RMSE=0.151966	R^2=0.424044
[419/500]	cv_eval_time=3.92 sec	RMSE=0.108199	R^2=0.707982
[420/500]	cv_eval_time=1.31 sec	RMSE=0.200237	R^2=-0.000052
[421/500]	cv_eval_time=1.28 sec	RMSE=0.200239	R^2=-0.000073
[422/500]	cv_eval_time=1.30 sec	RMSE=0.200237	R^2=-0.000036
[423/500]	cv_eval_time=1.32 sec	RMSE=0.200239	R^2=-0.000024
[424/500]	cv_eval_time=1.32 sec	RMSE=0.200237	R^2=-0.000012
[425/500]	cv_eval_time=1.31 sec	RMSE=0.200238	R^2=-0.000027
[426/500]	cv_eval_time=1.30 sec	RMSE=0.200237	R^2=-0.000120
[427/500]	cv_eval_time=1.30 sec	RMSE=0.200238	R^2=-0.000019
[428/500]	cv_eval_time=1.33 sec	RMSE=0.200244	R^2=-0.000017
[429/500]	cv_eval_time=12.88 sec	RMSE=0.10