In [8]:
import sys, warnings
sys.path.append('../code/')
import pandas as pd
import numpy as np
from tqdm import tqdm
from random import shuffle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import SCORERS, mean_squared_error
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [13]:
# Baseline regressors, keyed by the name that later labels each model in the
# results table and in the per-model prediction columns.
# The neural network is a new, unfitted MLP with hidden layers of 24, 24 and 16 units.
reg_dict = dict([
    ('Linear Regression', LinearRegression()),
    ('Feed-Forward Neural Network', MLPRegressor(hidden_layer_sizes=(24, 24, 16))),
])

# Hyper-parameter grids, one per regressor name (same keys as reg_dict).
#
# Keys follow scikit-learn's nested-parameter convention "<step>__<param>".
# The prefix must match the name of the estimator step in the pipeline the
# grid is searched over; that step is called 'reg' in this notebook, so the
# keys use 'reg__' (a 'clf__' prefix would be rejected as an invalid parameter).
params_dict = {
    # Nothing to tune for plain least squares.
    'Linear Regression': {},
    'Feed-Forward Neural Network': {
        # Each candidate value appears once; a repeated value only makes the
        # grid search refit an identical configuration.
        # NOTE(review): the grid previously listed 0.001 twice -- possibly 0.01
        # was intended as the third value; confirm before extending the grid.
        'reg__learning_rate_init': [0.0001, 0.001],
        'reg__alpha': [0, 0.0001, 0.001],
    },
}

# Placeholder for the selected model of each regressor: every name starts out
# mapped to None until a fitted/selected model is stored under it.
models_dict = dict.fromkeys(('Linear Regression', 'Feed-Forward Neural Network'))

# Stand-alone resampler instances.
# The ratio is passed as the `sampling_strategy` keyword: recent
# imbalanced-learn releases accept constructor arguments by keyword only, so
# the positional form raises a TypeError there.
# Note: `os` is a sampler object here, not the standard-library `os` module.
os = RandomOverSampler(sampling_strategy=0.3)
us = RandomUnderSampler(sampling_strategy=0.4)

In [None]:
# Build the resampling + regression pipeline for every regressor and put the
# resulting pipeline in models_dict.
#
# This cell is written as a self-contained loop so it runs on a fresh kernel:
# it iterates reg_dict itself instead of relying on `name`, `reg` and the crop
# variable `c` being left over from another cell. The former `c == ''` guard
# (full cascades only) is gone because no crop loop is active at this point.
for name, reg in reg_dict.items():

    # Sampler ratios are passed by keyword; recent imbalanced-learn releases
    # do not accept them positionally.
    # NOTE(review): float sampling strategies are documented for binary
    # targets -- confirm these resampling steps are meant to be fit on the
    # target used in this notebook before enabling the search below.
    model = Pipeline([
        ('oversample', RandomOverSampler(sampling_strategy=0.3)),
        ('undersample', RandomUnderSampler(sampling_strategy=0.4)),
        ('reg', reg)
    ])

    # Optional parameter optimization (disabled). Enabling it requires the
    # full-cascade training data (X_train, y_train) to be loaded first, and
    # grid keys prefixed with the step name used above ('reg__...').
    #params = params_dict[name]

    #grid = GridSearchCV(model, params,  scoring='neg_mean_squared_error', cv=5)

    #grid.fit(X_train, y_train)

    #model = grid.best_estimator_

    # store the (unfitted) pipeline for this regressor in models_dict
    models_dict[name] = model

In [35]:
# File-name suffixes of the data sets to evaluate; '' selects the un-cropped
# ("full") files.
cnames = ['', '_half_hour', '_1_hour', '_2_hour', '_3_hour']

# Parallel lists (one entry per crop/model pair) plus one prediction frame per crop.
crops, models, rmses, preds = [], [], [], []

# Number of regressors fitted per crop. Derived from reg_dict because that is
# the dict the loop below iterates; the name is kept in case other cells read it.
nmodels = len(reg_dict)

# loop through crops
for c in cnames:

    print('started', c, flush=True)

    # Load the train/test files for this crop and shuffle their rows.
    df_train = pd.read_csv('../data/grouped' + c + '.csv').sample(frac=1)
    df_test = pd.read_csv('../data/grouped' + c + '_test.csv').sample(frac=1)

    # Features are every column except the id and the target; `virality` is the target.
    X_train, y_train = df_train.drop(['cascade_id', 'virality'], axis = 1).values, df_train.virality.values
    X_test, y_test = df_test.drop(['cascade_id', 'virality'], axis = 1).values, df_test.virality.values

    # Explicit copy: prediction columns are added to this frame below, and
    # assigning into a slice of df_test is ambiguous (SettingWithCopyWarning).
    pred = df_test[['cascade_id', 'virality', 'size', 'depth', 'breadth', 'db_ratio']].copy()

    # if no crop --> full; otherwise the suffix without its leading underscore
    crop_label = c[1:] if c else 'full'

    for name, reg in tqdm(reg_dict.items()):

        # fit model and predict for every crop (the estimator object is
        # refitted on each crop's training data)
        model = reg

        model.fit(X_train, y_train)

        y_hat = model.predict(X_test)
        pred['y_hat_'+name] = y_hat
        rmse = mean_squared_error(y_test, y_hat)**.5

        # Record the crop label together with the model name and score so the
        # three lists always have the same length, whatever models_dict holds.
        crops.append(crop_label)
        models.append(name)
        rmses.append(rmse)

    preds.append(pred)

# The column is named 'mse' (the name the later cells and the exported CSV
# use), but the values stored in it are root-mean-squared errors.
results = pd.DataFrame({'crop':crops, 'model':models, 'mse':rmses})

started 
100%|██████████| 2/2 [00:01<00:00,  1.44it/s]started _half_hour

100%|██████████| 2/2 [00:01<00:00,  1.39it/s]started _1_hour

100%|██████████| 2/2 [00:01<00:00,  1.50it/s]started _2_hour

100%|██████████| 2/2 [00:01<00:00,  1.47it/s]started _3_hour

100%|██████████| 2/2 [00:01<00:00,  1.50it/s]


In [40]:
# Display the prediction table of the second crop in cnames ('_half_hour').
preds[1]

Unnamed: 0,cascade_id,virality,size,depth,breadth,db_ratio,y_hat_Linear Regression,y_testLinear Regression,y_hat_Feed-Forward Neural Network,y_testFeed-Forward Neural Network
309,82826,2.593907,-0.418494,0.309529,-0.409744,0.194321,2.874073,2.593907,3.154770,2.593907
66,81033,2.434190,-0.512840,-0.511845,-0.517626,0.155999,3.036310,2.434190,3.474894,2.434190
8,80337,2.402772,-0.472924,-0.511845,-0.483104,0.131818,2.663197,2.402772,2.818196,2.402772
127,81489,2.563221,-0.393093,-0.511845,-0.427006,0.114239,2.719265,2.563221,2.536084,2.563221
141,81576,2.170877,0.202009,-0.511845,0.276385,0.086824,2.479358,2.170877,2.096289,2.170877
...,...,...,...,...,...,...,...,...,...,...
278,82634,2.861889,-0.222546,-0.511845,-0.172404,0.093676,2.667467,2.861889,2.654035,2.861889
128,81495,2.624806,0.020575,0.309529,-0.073152,0.120293,2.922311,2.624806,2.555308,2.624806
101,81244,3.086743,-0.494696,0.309529,-0.530572,0.422097,3.059380,3.086743,3.206630,3.086743
175,81817,2.065178,-0.182631,-1.333219,-0.116305,0.060305,2.259117,2.065178,2.382395,2.065178


In [41]:
# Mean score per (model, crop), reshaped so that models are rows and crops are
# columns. The 'mse' column holds root-mean-squared errors (the square root is
# taken where the scores are computed).
results.groupby(['model', 'crop'])[['mse']].mean().reset_index().pivot(
    index='model', columns='crop', values='mse'
)

crop,1_hour,2_hour,3_hour,full,half_hour
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Feed-Forward Neural Network,0.363669,0.242976,0.269759,0.08948,0.416909
Linear Regression,0.470234,0.425475,0.385814,0.134727,0.48563


In [17]:
# Persist the per-crop, per-model score table as CSV: header row written,
# row index omitted.
results.to_csv(
    path_or_buf='../data/test_material/baselines_v2.csv',
    index=False,
    header=True,
)