In [14]:
import sys, warnings
sys.path.append('../code/')
import pandas as pd
import numpy as np
from tqdm import tqdm
from random import shuffle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import SCORERS, mean_squared_error, median_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor


if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [15]:
# create a new NN

# classifiers
reg_dict = {
    'Linear Regression': LinearRegression(), 
    'Feed-Forward Neural Network': MLPRegressor(max_iter=500),
    'Random Forest Regressor': RandomForestRegressor(),
    'Dummy Regressor (Mean)': DummyRegressor(strategy='mean')}

# parameters for each classifier
params_dict = {
    'Linear Regression' : {},
    'Feed-Forward Neural Network': {'hidden_layer_sizes': [(16,16), (16, 32, 16)],
                                    'activation': ['relu','tanh'],
                                    'alpha': [0.0001, 0.05]},
    'Random Forest Regressor': {},
    'Dummy Regressor (Mean)': {}}

# best model with specific parameters for each classifier
models_dict = {
    'Linear Regression': None, 
    'Feed-Forward Neural Network': None,
    'Random Forest Regressor': None,
    'Dummy Regressor (Mean)': None}

In [16]:
cnames = ['_1_hour', '_15_mins', '_30_mins', '_3_hour', '_24_hour']

crops, lin_crops,  models, mses, maes, coefs, feature_names = [], [], [], [], [], [], []
preds = {}

nmodels = len(models_dict)
df_cascade_sizes = pd.read_csv('../data/cascade_size.csv')

# loop through crops
for c in cnames:
    
    print('started', c, flush=True)

    df_train = pd.read_csv('../data/grouped/grouped' + c + '.csv')
    df_test = pd.read_csv('../data/grouped/grouped' + c + '_test.csv')
    
    X_train, X_test = df_train.iloc[:,1:].values, df_test.iloc[:,1:].values
    y_train = pd.merge(df_train, df_cascade_sizes, on='cascade_id', how='left')['cascade_size_log'].values
    y_test = pd.merge(df_test, df_cascade_sizes, on='cascade_id', how='left')['cascade_size_log'].values

    feature_name = df_train.columns.to_list()[1:]
    feature_names += feature_name
    
    pred = pd.merge(df_test, df_cascade_sizes, on='cascade_id', how='left')[['cascade_id', 'cascade_size_log']]

    # if no crop --> full
    crop = ''
    if c:
        crop = [c[1:]]
        crops += crop* nmodels
        lin_crops += crop * len(feature_name)
    else:
        crop = ['full']
        crops += crop * nmodels
        lin_crops += crop * len(feature_name)

    for name, reg in tqdm(reg_dict.items()):
        if c == '_1_hour':

            model = reg

            params = params_dict[name]

            grid = GridSearchCV(model, params,  scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

            grid.fit(X_train, y_train)

            model = grid.best_estimator_

            # store best model in model_dicts
            models_dict[name] = model

        # fit model and predict for every crop
        model = models_dict[name]

        model.fit(X_train, y_train)
            
        y_hat = model.predict(X_test)
        pred['y_hat_'+name] = y_hat
        mse = mean_squared_error(y_test, y_hat)
        mae = median_absolute_error(y_test, y_hat)

        models.append(name)
        mses.append(mse)
        maes.append(mae)

        if name == 'Linear Regression':
            coefs += model.coef_.tolist()

    
    preds[crop[0]] = pred 

results = pd.DataFrame({'crop':crops, 'model':models, 'mse':mses, 'mae':maes})
coefficients = pd.DataFrame({'crop':lin_crops, 'coef':coefs, 'feature_name':feature_names})

started _1_hour
100%|██████████| 4/4 [01:37<00:00, 24.48s/it]started _15_mins

100%|██████████| 4/4 [00:07<00:00,  1.77s/it]started _30_mins

100%|██████████| 4/4 [00:05<00:00,  1.33s/it]started _3_hour

100%|██████████| 4/4 [00:06<00:00,  1.52s/it]started _24_hour

100%|██████████| 4/4 [00:06<00:00,  1.54s/it]


In [18]:
preds['1_hour'].to_csv('../results/preds2.csv', index=False, header=True)

In [9]:
# print MSE
(results
.groupby(['model', 'crop'])
.agg({'mse':'mean'})
.reset_index()
.pivot(index='model',columns='crop',values='mse')
.style.format('{:.4f}'))

crop,15_mins,1_hour,24_hour,30_mins,3_hour
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dummy Regressor (Mean),1.1749,1.183,1.2413,1.3517,1.334
Feed-Forward Neural Network,0.5069,0.5363,0.2388,0.6357,0.5399
Linear Regression,0.5686,0.6165,0.3749,0.7279,0.6494
Random Forest Regressor,0.4737,0.5573,0.2815,0.5509,0.529


In [13]:
# print MAE
(results
.groupby(['model', 'crop'])
.agg({'mae':'median'})
.reset_index()
.pivot(index='model',columns='crop',values='mae')
.style.format('{:.4f}'))

crop,15_mins,1_hour,24_hour,30_mins,3_hour
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dummy Regressor (Mean),0.8663,0.8505,0.8253,0.9004,0.7926
Feed-Forward Neural Network,0.4674,0.393,0.1869,0.4105,0.3824
Linear Regression,0.4853,0.4677,0.3081,0.5023,0.4489
Random Forest Regressor,0.4561,0.4318,0.1745,0.4181,0.3287


In [18]:
coefficients = coefficients.pivot(index='feature_name', columns='crop', values='coef')
coefficients['1_hour'].plot(kind='barh', figsize=(9, 7))
plt.title('Coefficients of linear model')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
#plt.savefig('coeff.png')


KeyError: 'feature_name'

In [17]:
# store results
results.to_csv('../data/test_material/baselines_v2.csv', header=True, index=False)