# Experiments for Paper

This notebook contains all neural network experiments for the paper. The results are saved as CSV files for independent verification.

In [1]:
%load_ext autoreload
%autoreload 2

In [62]:
%matplotlib inline
from nn_src.imports import *

In [26]:
import os

# Data / result locations. The defaults are the original cluster paths; set the
# PPNN_DATA_DIR / PPNN_RESULTS_DIR environment variables to run the notebook
# on another machine without editing it.
# os.path.join(path, '') guarantees a trailing separator, which the f-string
# path building in the helper functions of this notebook relies on.
DATA_DIR = os.path.join(
    os.environ.get('PPNN_DATA_DIR', '/scratch/srasp/ppnn_data/'), '')
RESULTS_DIR = os.path.join(
    os.environ.get('PPNN_RESULTS_DIR',
                   '/export/home/srasp/repositories/ppnn/results/csv_files/'), '')

In [4]:
def reset_weights(model):
    """Re-initialise every layer weight of `model` in place.

    The initializer op of each variable in ``layer.weights`` is run in the
    current backend session, so kernels, biases and embedding matrices all get
    fresh values. Checking only for ``kernel_initializer`` would skip biases
    and layers without a kernel (e.g. embeddings), letting trained values leak
    from one ensemble member into the next.

    Args:
        model: Object exposing ``layers``; every layer exposes ``weights``, a
            list of variables carrying an ``initializer`` op with a
            ``run(session=...)`` method (TF1-style Keras variables).

    Note:
        Only layer weights are touched; optimizer state is left as it is.
    """
    session = K.get_session()
    for layer in model.layers:
        for weight in layer.weights:
            # Skip anything that does not carry an initializer op.
            initializer = getattr(weight, 'initializer', None)
            if initializer is not None:
                initializer.run(session=session)

In [5]:
def ensemble_scores(m, n, x_trn, y_trn, x_test, y_test, eval_batch_size=4096, **kwargs):
    """Train `n` re-initialised copies of model `m` and collect their results.

    For every ensemble member the weights are reset with ``reset_weights``,
    the model is fitted on the training data, and then evaluated on both the
    training and the test data.

    Args:
        m: Compiled model exposing ``fit``, ``evaluate`` and ``predict``.
        n: Number of ensemble members to train.
        x_trn, y_trn: Training inputs and targets.
        x_test, y_test: Test inputs and targets.
        eval_batch_size: Batch size used for ``evaluate`` and ``predict``
            (independent of the training ``batch_size`` in ``kwargs``).
        **kwargs: Forwarded unchanged to ``m.fit``.

    Returns:
        Tuple ``(trn_scores, test_scores, preds)`` of three lists with one
        entry per ensemble member: the ``evaluate`` result on the training
        data, the ``evaluate`` result on the test data, and the ``predict``
        output for the test inputs.
    """
    trn_scores, test_scores, preds = [], [], []
    for _ in tqdm(range(n)):
        # The same model object is reused, so start each member from scratch.
        reset_weights(m)
        m.fit(x_trn, y_trn, **kwargs)
        trn_scores.append(m.evaluate(x_trn, y_trn, batch_size=eval_batch_size, verbose=0))
        test_scores.append(m.evaluate(x_test, y_test, batch_size=eval_batch_size, verbose=0))
        preds.append(m.predict(x_test, batch_size=eval_batch_size, verbose=0))
    return trn_scores, test_scores, preds

In [27]:
def save_ensemble(preds, test_set, exp_name, save=True):
    """Average ensemble predictions, print their CRPS and optionally save them.

    Args:
        preds: Sequence of per-member predictions; indexed as
            ``[member, sample, column]`` where column 0 is passed to
            ``crps_normal`` as the mean and column 1 as the standard deviation.
        test_set: Object providing ``targets``, ``date_strs`` and
            ``station_ids`` for the samples that were predicted.
        exp_name: Experiment name; the CSV is written to
            ``{RESULTS_DIR}{exp_name}.csv``.
        save: If False, only the score is printed and nothing is written.
    """
    # np.array copies, so the caller's predictions are not modified below.
    preds = np.array(preds)
    preds[:, :, 1] = np.abs(preds[:, :, 1])   # Make sure std is positive
    # Average mean and std over the ensemble members (axis 0).
    mean_preds = np.mean(preds, 0)
    ens_score = crps_normal(mean_preds[:, 0], mean_preds[:, 1], test_set.targets).mean()
    print(f'Ensemble test score = {ens_score}')
    if save:
        results_df = create_results_df(test_set.date_strs, test_set.station_ids, mean_preds[:, 0], mean_preds[:, 1])
        csv_path = f'{RESULTS_DIR}{exp_name}.csv'
        # Write first, report afterwards: a failed write must not be logged as saved.
        results_df.to_csv(csv_path)
        print(f'Saved results in {csv_path}')

In [7]:
def get_datasets(pickled_name, train_dates, test_dates=None, aux=False, reload=False):
    """Return ``(train_set, test_set)``, using a pickle cache when available.

    The sets are built with ``get_train_test_sets`` and cached under
    ``{DATA_DIR}pickled/{pickled_name}``; later calls load that file instead
    of rebuilding, unless ``reload`` is set.

    Args:
        pickled_name: File name of the cache inside ``{DATA_DIR}pickled/``.
        train_dates: Date range for the training set, passed through to
            ``get_train_test_sets``.
        test_dates: Date range for the test set; defaults to
            ``['2016-01-01', '2017-01-01']``.
        aux: If True, ``aux_dict`` is passed so auxiliary variables are
            included; otherwise ``None`` is passed.
        reload: If True, rebuild the sets and overwrite the cache file.

    Returns:
        Tuple ``(train_set, test_set)``.
    """
    if test_dates is None:
        # Fresh list per call: a shared default could be altered by the callee.
        test_dates = ['2016-01-01', '2017-01-01']
    pickle_fn = f'{DATA_DIR}pickled/{pickled_name}'
    if reload or not os.path.exists(pickle_fn):
        var_dict = aux_dict if aux else None
        train_set, test_set = get_train_test_sets(
            DATA_DIR,
            train_dates,
            test_dates,
            aux_dict=var_dict,
        )
        # Make sure the cache directory exists so the (expensive) result of
        # the step above is not lost to a FileNotFoundError.
        os.makedirs(os.path.dirname(pickle_fn), exist_ok=True)
        # Save pickled dataset
        with open(pickle_fn, 'wb') as f:
            pickle.dump((train_set, test_set), f)
    else:
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(pickle_fn, 'rb') as f:
            train_set, test_set = pickle.load(f)
    return train_set, test_set

## Train 2015

In [8]:
# Load the train/test sets for training dates 2015-01-01 .. 2016-01-01 without
# auxiliary variables (cached as 15_16.pkl); test dates use the get_datasets default.
train_set, test_set = get_datasets('15_16.pkl', ['2015-01-01', '2016-01-01'], aux=False)

In [9]:
# Sanity check: shapes of the training feature matrix and target vector.
train_set.features.shape, train_set.targets.shape

((180849, 2), (180849,))

In [84]:
# Same date ranges as above, but built with aux=True so the auxiliary
# variables are included (cached as aux_15_16.pkl).
aux_train_set, aux_test_set = get_datasets('aux_15_16.pkl', ['2015-01-01', '2016-01-01'], aux=True)

In [30]:
# Number of columns in the auxiliary feature matrix; used below as the input
# size of the models trained on auxiliary data.
n_features = aux_train_set.features.shape[1]
n_features

40

### Fully connected network

In [10]:
# Build and compile the model for the two-column feature set. Per the summary
# printed below it is a single Dense layer with 2 inputs and 2 outputs.
# Presumably the arguments are (n_inputs, n_outputs) -- confirm in nn_src.
fc = build_fc_model(2, 2, compile=True, lr=0.1)

In [11]:
# Print the layer table and parameter counts of fc.
fc.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 6         
Total params: 6
Trainable params: 6
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train a 10-member ensemble of the fc model on the 2015 data and keep the
# per-member scores and test predictions.
trn_scores, test_scores, preds = ensemble_scores(
    m=fc,
    n=10,
    x_trn=train_set.features,
    y_trn=train_set.targets,
    x_test=test_set.features,
    y_test=test_set.targets,
    epochs=30,
    batch_size=4096,
    verbose=0,
)




In [17]:
# Test-set evaluate() result of each ensemble member (one entry per member).
test_scores

[1.0129184318453153,
 1.0127042319649044,
 1.012294478162556,
 1.012756661207207,
 1.0125142742484778,
 1.012048500136013,
 1.0122905868857945,
 1.0124105413272844,
 1.0119452775149462,
 1.0123264286334892]

In [28]:
# Score the member-averaged prediction and write it to fc_15.csv.
save_ensemble(preds=preds, test_set=test_set, exp_name='fc_15')

Ensemble test score = 1.012333025444975
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/fc_15.csv


### Fully connected network with auxiliary data

In [39]:
# Same architecture as fc, but with n_features inputs (the auxiliary feature
# columns) and a smaller learning rate.
fc_aux = build_fc_model(n_features, 2, compile=True, lr=0.02)

In [40]:
# Print the layer table and parameter counts of fc_aux.
fc_aux.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 40)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 82        
Total params: 82
Trainable params: 82
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Train a 10-member ensemble of fc_aux on the 2015 auxiliary data.
trn_scores, test_scores, preds = ensemble_scores(
    m=fc_aux,
    n=10,
    x_trn=aux_train_set.features,
    y_trn=aux_train_set.targets,
    x_test=aux_test_set.features,
    y_test=aux_test_set.targets,
    epochs=30,
    batch_size=1024,
    verbose=0,
)




In [44]:
# Test-set evaluate() result of each fc_aux ensemble member.
test_scores

[0.9144555258888823,
 0.9137016375543812,
 0.9182925124955492,
 0.9234101901672326,
 0.9260401680091928,
 0.9197163654858257,
 0.9210973829235432,
 0.9152023871478893,
 0.9193962684983115,
 0.9222847539083309]

In [45]:
# The predictions were made on aux_test_set.features, so targets, dates and
# station ids must come from that same object (not from test_set).
save_ensemble(preds, aux_test_set, 'fc_aux_15')

Ensemble test score = 0.9160690354720558
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/fc_aux_15.csv


### Fully connected network with station embeddings

In [46]:
# Passed to build_emb_model below; presumably the width of the embedding vector.
emb_size = 2
# Largest cont_ids value found in either the train or the test set
# (presumably contiguous station indices -- confirm in nn_src).
max_id = int(np.max([aux_train_set.cont_ids.max(), aux_test_set.cont_ids.max()]))
max_id

536

In [47]:
# Embedding model on the two-column feature set. The empty list is presumably
# the list of hidden layer sizes (none here) -- confirm in nn_src.
fc_emb = build_emb_model(2, 2, [], emb_size, max_id, compile=True, lr=0.02)

In [48]:
# Print the layer table and parameter counts of fc_emb.
fc_emb.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 2)         1074        input_7[0][0]                    
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 2)            0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 2)            0           embedding_1[0][0]                
__________________________________________________________________________________________________
concatenat

In [49]:
# Train a 10-member ensemble of fc_emb; the model takes two inputs, the
# feature matrix and the cont_ids fed to the embedding.
trn_scores, test_scores, preds = ensemble_scores(
    m=fc_emb,
    n=10,
    x_trn=[train_set.features, train_set.cont_ids],
    y_trn=train_set.targets,
    x_test=[test_set.features, test_set.cont_ids],
    y_test=test_set.targets,
    epochs=30,
    batch_size=1024,
    verbose=0,
)




In [50]:
# Test-set evaluate() result of each fc_emb ensemble member.
test_scores

[0.9134160970905943,
 0.9129536976167542,
 0.9134277858473911,
 0.9127073982580509,
 0.9128624608552308,
 0.911282314291106,
 0.9149842401850841,
 0.916091271955339,
 0.9166416135858159,
 0.9119918908308128]

In [51]:
# Score the member-averaged prediction and write it to fc_emb_15.csv.
save_ensemble(preds=preds, test_set=test_set, exp_name='fc_emb_15')

Ensemble test score = 0.912276955076594
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/fc_emb_15.csv


### Fully connected network with auxiliary data and station embeddings

In [52]:
# Same values as in the previous section, recomputed so this section can be
# run on its own.
emb_size = 2
# Largest cont_ids value found in either the train or the test set.
max_id = int(np.max([aux_train_set.cont_ids.max(), aux_test_set.cont_ids.max()]))
max_id

536

In [53]:
# Embedding model with n_features (auxiliary) inputs; the empty list is
# presumably the list of hidden layer sizes (none here).
fc_aux_emb = build_emb_model(n_features, 2, [], emb_size, max_id, compile=True, lr=0.02)

In [54]:
# Print the layer table and parameter counts of fc_aux_emb.
fc_aux_emb.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 2)         1074        input_9[0][0]                    
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 2)            0           embedding_2[0][0]                
__________________________________________________________________________________________________
concatenat

In [55]:
# Train a 10-member ensemble of fc_aux_emb on the 2015 auxiliary data plus
# the cont_ids fed to the embedding.
trn_scores, test_scores, preds = ensemble_scores(
    m=fc_aux_emb,
    n=10,
    x_trn=[aux_train_set.features, aux_train_set.cont_ids],
    y_trn=aux_train_set.targets,
    x_test=[aux_test_set.features, aux_test_set.cont_ids],
    y_test=aux_test_set.targets,
    epochs=30,
    batch_size=1024,
    verbose=0,
)




In [56]:
# Test-set evaluate() result of each fc_aux_emb ensemble member.
test_scores

[0.8773318712557123,
 0.8792898704042933,
 0.8791169621657385,
 0.8760316709484822,
 0.8800153534609878,
 0.875850746922459,
 0.8776595033971417,
 0.8802928969328306,
 0.8782212627710079,
 0.8756937369251041]

In [57]:
# The predictions were made on aux_test_set, so targets, dates and station
# ids must come from that same object (not from test_set).
save_ensemble(preds, aux_test_set, 'fc_aux_emb_15')

Ensemble test score = 0.8753270056220924
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/fc_aux_emb_15.csv


### Neural network with auxiliary data and station embeddings

In [85]:
# Same inputs as fc_aux_emb, but with [50] as third argument -- presumably one
# hidden layer of 50 units (confirm in nn_src) -- and a lower learning rate.
nn_aux_emb = build_emb_model(n_features, 2, [50], emb_size, max_id, compile=True, lr=0.01)

In [86]:
# Print the layer table and parameter counts of nn_aux_emb.
nn_aux_emb.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 2)         1074        input_21[0][0]                   
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 40)           0                                            
__________________________________________________________________________________________________
flatten_8 (Flatten)             (None, 2)            0           embedding_8[0][0]                
__________________________________________________________________________________________________
concatenat

In [87]:
# Train a 10-member ensemble of nn_aux_emb on the 2015 auxiliary data plus
# the cont_ids fed to the embedding.
trn_scores, test_scores, preds = ensemble_scores(
    m=nn_aux_emb,
    n=10,
    x_trn=[aux_train_set.features, aux_train_set.cont_ids],
    y_trn=aux_train_set.targets,
    x_test=[aux_test_set.features, aux_test_set.cont_ids],
    y_test=aux_test_set.targets,
    epochs=30,
    batch_size=1024,
    verbose=0,
)




Exception in thread Thread-15:
Traceback (most recent call last):
  File "/export/home/srasp/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/export/home/srasp/anaconda3/lib/python3.6/site-packages/tqdm/_monitor.py", line 63, in run
    for instance in self.tqdm_cls._instances:
  File "/export/home/srasp/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [88]:
# Per-member test scores, followed by their mean and standard deviation
# across the ensemble members.
test_scores, np.mean(test_scores), np.std(test_scores)

([0.8480887064656757,
  0.8462630754384166,
  0.8614147822296818,
  0.8562129028226138,
  0.8385955247565634,
  0.8429744203315795,
  0.8347998716737812,
  0.8364794055396216,
  0.8505500671043409,
  0.8379988821164724],
 0.8453377638478747,
 0.008406849640801198)

In [90]:
# Score the member-averaged prediction and write it to nn_aux_emb_15.csv.
save_ensemble(preds=preds, test_set=aux_test_set, exp_name='nn_aux_emb_15')

Ensemble test score = 0.817190709085503
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/nn_aux_emb_15.csv


## Train 2007-2015

Note that the first two days of 2007 are missing from the data, so the training period starts on 2007-01-03.

In [81]:
# Long training period: 2007-01-03 .. 2016-01-01, without auxiliary variables
# (cached as 07_16.pkl); test dates use the get_datasets default.
train_set_long, test_set_long = get_datasets('07_16.pkl', ['2007-01-03', '2016-01-01'], aux=False)

train set contains 3285 days
test set contains 366 days


In [82]:
# Long training period with auxiliary variables (cached as aux_07_16.pkl).
aux_train_set_long, aux_test_set_long = get_datasets('aux_07_16.pkl', ['2007-01-03', '2016-01-01'], aux=True)

train set contains 3285 days
test set contains 366 days


### Fully connected network

In [10]:
# Rebuild the two-input fc model for the long training period; this rebinds
# the name fc used in the 2015 section.
fc = build_fc_model(2, 2, compile=True, lr=0.1)

In [11]:
# Print the layer table and parameter counts of fc.
fc.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 6         
Total params: 6
Trainable params: 6
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train a 2-member ensemble of fc on the 2007-2015 data.
trn_scores, test_scores, preds = ensemble_scores(
    m=fc,
    n=2,
    x_trn=train_set_long.features,
    y_trn=train_set_long.targets,
    x_test=test_set_long.features,
    y_test=test_set_long.targets,
    epochs=30,
    batch_size=4096,
    verbose=0,
)




In [17]:
# Test-set evaluate() result of each ensemble member.
# NOTE(review): the ensemble call above requests 2 members, so a displayed
# list with a different number of entries is stale output -- re-run the cell.
test_scores

[1.0129184318453153,
 1.0127042319649044,
 1.012294478162556,
 1.012756661207207,
 1.0125142742484778,
 1.012048500136013,
 1.0122905868857945,
 1.0124105413272844,
 1.0119452775149462,
 1.0123264286334892]

In [28]:
# The predictions were made on test_set_long.features, so targets, dates and
# station ids must come from that same object (not from the 2015 test_set).
save_ensemble(preds, test_set_long, 'fc_07-15')

Ensemble test score = 1.012333025444975
Saved results in /export/home/srasp/repositories/ppnn/results/csv_files/fc_15.csv


### Neural network with auxiliary data and station embeddings

In [17]:
# Embedding model for the long training period; [512] is presumably one hidden
# layer of 512 units (confirm in nn_src).
# NOTE(review): n_features, emb_size and max_id are the values computed from
# the 2015 data sets in the previous section -- confirm that the 2007-2015
# sets have the same feature count and no cont_ids value above max_id.
emb_model = build_emb_model(n_features, 2, [512], emb_size, max_id, compile=True, lr=0.002)

In [18]:
# Print the layer table and parameter counts of emb_model.
emb_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 2)         1074        input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 2)            0           embedding_1[0][0]                
__________________________________________________________________________________________________
concatenat

In [27]:
# Early-stopping callback on the validation loss with a patience of 3 epochs.
# NOTE(review): the training cell below passes callbacks=None, so this list is
# currently unused; the fit arguments there also contain no validation data,
# which monitoring 'val_loss' would need.
callbacks = [EarlyStopping(monitor='val_loss',patience=3)]

In [36]:
# Train a 5-member ensemble on the 2007-2015 data. This section must use the
# *_long data sets loaded above; aux_train_set / aux_test_set hold the
# 2015-only data from the previous section.
trn_scores, test_scores, preds = ensemble_scores(
    emb_model, 5,
    [aux_train_set_long.features, aux_train_set_long.cont_ids], aux_train_set_long.targets,
    [aux_test_set_long.features, aux_test_set_long.cont_ids], aux_test_set_long.targets,
    epochs=15, batch_size=4096, verbose=1, callbacks=None,
)

Epoch 1/15
 299008/1456977 [=====>........................] - ETA: 4s - loss: 2.3259

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/export/home/srasp/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/export/home/srasp/anaconda3/lib/python3.6/site-packages/tqdm/_monitor.py", line 63, in run
    for instance in self.tqdm_cls._instances:
  File "/export/home/srasp/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15



In [37]:
# Test-set evaluate() result of each emb_model ensemble member.
test_scores

[0.7845812783046421,
 0.785648775726266,
 0.7820323736911864,
 0.7842717505423521,
 0.779894316266704]

In [38]:
# Only print the ensemble score (save=False, so nothing is written). The
# targets must come from the test set that belongs to the 2007-2015 data.
save_ensemble(preds, aux_test_set_long, '', save=False)

Ensemble test score = 0.7754476315144285
