In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# read the data
preprocessed_data = pd.read_pickle('../data/preprocessed_science_hindex.pd')

# Create a dataframe
df = pd.DataFrame(preprocessed_data)
print(df.columns)

Index(['NUM_FIRST_POS', 'NUM_SECOND_POS', 'NUM_YEARS_SINCE_FIRST_PUBLICATION',
       'NUM_YEARS_BETWEEN_FIRST_AND_LAST_PUBLICATION', 'NUM_INSTITUTIONS',
       'NUM_TOP500_INSTITUTIONS', 'SHANGHAI_RANK', 'NTU_RANK', 'THE_RANK',
       'SHANGHAI_SCORE', 'NTU_SCORE', 'THE_SCORE', 'AVG_TITLE_LENGTH',
       'AVG_ABSTRACT_LENGTH', 'COLLAB_DEGREE_UNWEIGHTED',
       'COLLAB_DEGREE_WEIGHTED', 'COLLAB_CLOSENESS_UNWEIGHTED',
       'COLLAB_CLOSENESS_WEIGHTED', 'COLLAB_BETWEENNESS_UNWEIGHTED',
       'COLLAB_BETWEENNESS_WEIGHTED', 'COLLAB_PAGERANK_UNWEIGHTED',
       'COLLAB_PAGERANK_WEIGHTED', 'COLLAB_EIGENVECTOR_UNWEIGHTED',
       'COLLAB_EIGENVECTOR_WEIGHTED', 'CIT_OUTDEGREE_UNWEIGHTED',
       'CIT_OUTDEGREE_WEIGHTED', 'CIT_CLOSENESS_UNWEIGHTED',
       'CIT_CLOSENESS_WEIGHTED', 'CIT_BETWEENNESS_UNWEIGHTED',
       'CIT_BETWEENNESS_WEIGHTED', 'score', 'TOP_SIM_CORPUS',
       'TOP_SIM_UNIFORM', 'TOP_SIM_PAPERS', 'NUM_TOPICS_GREATER_CORPUS',
       'NUM_TOPICS_GREATER_UNIFORM'],
      dtyp

In [3]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    #some parameters
    DO_LOWESS = False
    hyperopt = True
    # Run Lasso with hyperopt optimization
    lasso = HyperoptModel(train.copy(), test.copy(),'lasso', cv=4, max_evals = 500)
    features = list(df)
    features.remove('score')

    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(**{'alpha': 0.0006577184991258585}))
    ])
    
    lasso.raw_features = features
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.86 sec	RMSE=0.178670	R^2=-0.000103
[2/500]	cv_eval_time=1.31 sec	RMSE=0.178669	R^2=-0.000031
[3/500]	cv_eval_time=1.57 sec	RMSE=0.178672	R^2=-0.000068
[4/500]	cv_eval_time=1.40 sec	RMSE=0.178674	R^2=-0.000114
[5/500]	cv_eval_time=2.07 sec	RMSE=0.178673	R^2=-0.000045
[6/500]	cv_eval_time=2.13 sec	RMSE=0.178668	R^2=-0.000059
[7/500]	cv_eval_time=2.39 sec	RMSE=0.178669	R^2=-0.000005
[8/500]	cv_eval_time=2.67 sec	RMSE=0.178671	R^2=-0.000020
[9/500]	cv_eval_time=2.48 sec	RMSE=0.178672	R^2=-0.000009
[10/500]	cv_eval_time=3.42 sec	RMSE=0.178674	R^2=-0.000019
[11/500]	cv_eval_time=2.02 sec	RMSE=0.178669	R^2=-0.000066
[12/500]	cv_eval_time=2.02 sec	RMSE=0.178669	R^2=-0.000151
[13/500]	cv_eval_time=2.00 sec	RMSE=0.178676	R^2=-0.000035
[14/500]	cv_eval_time=2.15 sec	RMSE=0.178669	R^2=-0.000149
[15/500]	cv_eval_time=1.92 sec	RMSE=0.178676	R^2=-0.000021
[16/500]	cv_eval_time=2.00 sec	RMSE=0.178669	R^2=-0.000177
[17/500]	cv_eval_time=2.18 

[139/500]	cv_eval_time=1.96 sec	RMSE=0.178669	R^2=-0.000009
[140/500]	cv_eval_time=2.12 sec	RMSE=0.178670	R^2=-0.000035
[141/500]	cv_eval_time=2.69 sec	RMSE=0.178306	R^2=0.003676
[142/500]	cv_eval_time=1.97 sec	RMSE=0.178668	R^2=-0.000179
[143/500]	cv_eval_time=2.40 sec	RMSE=0.178675	R^2=-0.000183
[144/500]	cv_eval_time=1.91 sec	RMSE=0.178669	R^2=-0.000060
[145/500]	cv_eval_time=1.87 sec	RMSE=0.178669	R^2=-0.000032
[146/500]	cv_eval_time=1.79 sec	RMSE=0.178672	R^2=-0.000081
[147/500]	cv_eval_time=22.35 sec	RMSE=0.094738	R^2=0.718849
[148/500]	cv_eval_time=1.63 sec	RMSE=0.178668	R^2=-0.000051
[149/500]	cv_eval_time=4.78 sec	RMSE=0.165395	R^2=0.143064
[150/500]	cv_eval_time=1.69 sec	RMSE=0.178670	R^2=-0.000042
[151/500]	cv_eval_time=1.65 sec	RMSE=0.178669	R^2=-0.000028
[152/500]	cv_eval_time=1.65 sec	RMSE=0.178671	R^2=-0.000020
[153/500]	cv_eval_time=1.78 sec	RMSE=0.178669	R^2=-0.000004
[154/500]	cv_eval_time=1.61 sec	RMSE=0.178672	R^2=-0.000079
[155/500]	cv_eval_time=1.74 sec	RMSE=0.178



[195/500]	cv_eval_time=187.78 sec	RMSE=0.085313	R^2=0.772026
[196/500]	cv_eval_time=1.14 sec	RMSE=0.178669	R^2=-0.000129
[197/500]	cv_eval_time=1.14 sec	RMSE=0.178669	R^2=-0.000009
[198/500]	cv_eval_time=1.12 sec	RMSE=0.178668	R^2=-0.000171
[199/500]	cv_eval_time=1.13 sec	RMSE=0.178671	R^2=-0.000160
[200/500]	cv_eval_time=1.14 sec	RMSE=0.178668	R^2=-0.000034
[201/500]	cv_eval_time=1.12 sec	RMSE=0.178672	R^2=-0.000002
[202/500]	cv_eval_time=1.10 sec	RMSE=0.178671	R^2=-0.000058
[203/500]	cv_eval_time=1.18 sec	RMSE=0.178672	R^2=-0.000032
[204/500]	cv_eval_time=1.20 sec	RMSE=0.178669	R^2=-0.000065
[205/500]	cv_eval_time=1.18 sec	RMSE=0.178674	R^2=-0.000040
[206/500]	cv_eval_time=1.17 sec	RMSE=0.178670	R^2=-0.000027
[207/500]	cv_eval_time=1.18 sec	RMSE=0.178669	R^2=-0.000050
[208/500]	cv_eval_time=1.19 sec	RMSE=0.178669	R^2=-0.000043
[209/500]	cv_eval_time=1.18 sec	RMSE=0.178669	R^2=-0.000028
[210/500]	cv_eval_time=13.68 sec	RMSE=0.118704	R^2=0.558608
[211/500]	cv_eval_time=1.15 sec	RMSE=0.

[332/500]	cv_eval_time=1.19 sec	RMSE=0.178669	R^2=-0.000063
[333/500]	cv_eval_time=1.20 sec	RMSE=0.178669	R^2=-0.000033
[334/500]	cv_eval_time=1.17 sec	RMSE=0.178669	R^2=-0.000073
[335/500]	cv_eval_time=1.17 sec	RMSE=0.178678	R^2=-0.000030
[336/500]	cv_eval_time=1.18 sec	RMSE=0.178676	R^2=-0.000048
[337/500]	cv_eval_time=1.17 sec	RMSE=0.178668	R^2=-0.000102
[338/500]	cv_eval_time=1.13 sec	RMSE=0.178669	R^2=-0.000015
[339/500]	cv_eval_time=1.12 sec	RMSE=0.178669	R^2=-0.000018
[340/500]	cv_eval_time=1.15 sec	RMSE=0.178671	R^2=-0.000063
[341/500]	cv_eval_time=7.49 sec	RMSE=0.085982	R^2=0.768441
[342/500]	cv_eval_time=1.12 sec	RMSE=0.178669	R^2=-0.000022
[343/500]	cv_eval_time=1.12 sec	RMSE=0.178670	R^2=-0.000022
[344/500]	cv_eval_time=1.17 sec	RMSE=0.178672	R^2=-0.000057
[345/500]	cv_eval_time=1.15 sec	RMSE=0.178672	R^2=-0.000146
[346/500]	cv_eval_time=1.19 sec	RMSE=0.178672	R^2=-0.000093
[347/500]	cv_eval_time=1.16 sec	RMSE=0.178670	R^2=-0.000056
[348/500]	cv_eval_time=4.14 sec	RMSE=0.08

[469/500]	cv_eval_time=1.05 sec	RMSE=0.178670	R^2=-0.000049
[470/500]	cv_eval_time=1.08 sec	RMSE=0.178669	R^2=-0.000009
[471/500]	cv_eval_time=1.06 sec	RMSE=0.178671	R^2=-0.000119
[472/500]	cv_eval_time=1.05 sec	RMSE=0.178671	R^2=-0.000176
[473/500]	cv_eval_time=1.04 sec	RMSE=0.178673	R^2=-0.000124
[474/500]	cv_eval_time=1.04 sec	RMSE=0.178675	R^2=-0.000176
[475/500]	cv_eval_time=1.04 sec	RMSE=0.178668	R^2=-0.000160
[476/500]	cv_eval_time=1.04 sec	RMSE=0.178674	R^2=-0.000058
[477/500]	cv_eval_time=1.04 sec	RMSE=0.178668	R^2=-0.000210
[478/500]	cv_eval_time=1.04 sec	RMSE=0.178670	R^2=-0.000144
[479/500]	cv_eval_time=1.06 sec	RMSE=0.178669	R^2=-0.000041
[480/500]	cv_eval_time=1.05 sec	RMSE=0.178670	R^2=-0.000186
[481/500]	cv_eval_time=1.05 sec	RMSE=0.178671	R^2=-0.000163
[482/500]	cv_eval_time=3.40 sec	RMSE=0.102594	R^2=0.670242
[483/500]	cv_eval_time=1.04 sec	RMSE=0.178672	R^2=-0.000024
[484/500]	cv_eval_time=1.04 sec	RMSE=0.178669	R^2=-0.000119
[485/500]	cv_eval_time=1.06 sec	RMSE=0.17