In [12]:
import pickle
from os.path import join
import pandas as pd
import numpy as np
from collections import defaultdict
import torch


In [13]:
# load time series for graph

fp = r'C:\Users\rmartinez4\Box\Personal Git\Nautilus-seq2seq\data-scalability-spatiotemporal\data'

# Open the pickle inside a context manager so the file handle is closed as soon
# as the graph has been loaded (a bare open() call leaves the handle dangling).
# NOTE(review): pickle.load can execute arbitrary code -- only load graph files
# that come from a trusted source.
with open(join(fp, 'full_graph.pkl'), 'rb') as graph_file:
    g = pickle.load(graph_file)

# Timestamps are taken from node 0's 'values' index, dropping the first 12 and
# the last 11 entries.
# NOTE(review): presumably this trims the steps consumed by the input/output
# windows -- confirm against the dataset generation script.
time_series = list(g.nodes[0]['values'].index)[12:-11]

# One sensor id per graph node, in node iteration order.
sensors = [g.nodes[i]['sensor'] for i in g.nodes]

print(len(time_series))
print(len(sensors))

52090
320


In [14]:
# Show the first and last timestamps kept after trimming the series.
print(time_series[0], time_series[-1]) 

2020-01-01 01:05:00 2020-06-29 23:05:00


In [15]:
%%time

# Open the train, val, and test prediction archives (in that order).
train, val, test = (
    np.load(npz_path)
    for npz_path in ('./dcrnn_predictions20_train.npz',
                     './dcrnn_predictions20_val.npz',
                     './dcrnn_predictions20_test.npz')
)


Wall time: 9 ms


In [16]:
# Print the 'prediction' and 'truth' shapes for train, then test, then val,
# one shape per line.
print(
    *(split[key].shape
      for split in (train, test, val)
      for key in ('prediction', 'truth')),
    sep='\n',
)


(12, 36480, 320)
(12, 36480, 320)
(12, 10432, 320)
(12, 10432, 320)
(12, 5248, 320)
(12, 5248, 320)


In [17]:
# Total number of (padded) prediction samples across the three splits (axis 1).
len_pred = sum(split['prediction'].shape[1] for split in (train, test, val))

In [18]:
raw_data_fp = r'C:\Users\rmartinez4\Box\Personal Git\Nautilus-seq2seq\data-scalability-spatiotemporal\25'

# The raw train/val/test archives sit side by side in the same directory.
raw_train, raw_val, raw_test = (
    np.load(join(raw_data_fp, npz_name))
    for npz_name in ('train.npz', 'val.npz', 'test.npz')
)

batch_size = 64

# Number of samples needed to round each split up to a whole number of batches.
# (-n) % batch_size is the same value as (batch_size - n % batch_size) % batch_size.
num_padding_train = -len(raw_train['x']) % batch_size
num_padding_val = -len(raw_val['x']) % batch_size
num_padding_test = -len(raw_test['x']) % batch_size

print(num_padding_train, num_padding_val, num_padding_test, sep='\n')

print('Total Padding: {}'.format(
    sum((num_padding_train, num_padding_val, num_padding_test))))

17
39
14
Total Padding: 70


In [19]:
# Total number of raw (unpadded) samples across train, val, and test.
len_raw = sum(len(raw_split['x']) for raw_split in (raw_train, raw_val, raw_test))
print(len_raw)

52090


In [20]:
# Compare the three length counts pairwise; one report line per comparison.
print(
    'Diff between timeseries and raw: {}'.format(len(time_series) - len_raw),
    'Diff between pred and timeseries: {}'.format(len_pred - len(time_series)),
    'Diff between raw and pred: {}'.format(len_pred - len_raw),
    sep='\n',
)


Diff between timeseries and raw: 0
Diff between pred and timeseries: 70
Diff between raw and pred: 70


In [21]:
# unpad train, val, and test to create an unpadded predictions dataset

def _drop_padding(batched, num_padding):
    """Return `batched` without its last `num_padding` entries along axis 1.

    The end index is computed explicitly because `batched[:, :-num_padding, :]`
    yields an EMPTY array when `num_padding` is 0 (``-0 == 0``), which would
    silently discard a split whose length is already a multiple of the batch
    size.
    """
    return batched[:, :batched.shape[1] - num_padding, :]


# Each split paired with the number of padded samples to strip from its end.
_split_padding = ((train, num_padding_train),
                  (val, num_padding_val),
                  (test, num_padding_test))

# Concatenate along axis 1 in train -> val -> test order.
prediction = np.concatenate(
    [_drop_padding(split['prediction'], pad) for split, pad in _split_padding],
    axis=1)

truth = np.concatenate(
    [_drop_padding(split['truth'], pad) for split, pad in _split_padding],
    axis=1)

print(prediction.shape)
print(truth.shape)


(12, 52090, 320)
(12, 52090, 320)


In [22]:
# After unpadding, both arrays should have exactly one row per timestamp
# (a difference of 0).
print('Diff between pred (unpadded) and timeseries: {}'.format(prediction.shape[1]-len(time_series)))

# This check is on the truth array, so label it as such (it previously reused
# the "pred" label, making the two report lines indistinguishable).
print('Diff between truth (unpadded) and timeseries: {}'.format(truth.shape[1]-len(time_series)))

Diff between pred (unpadded) and timeseries: 0
Diff between pred (unpadded) and timeseries: 0


In [23]:
%%time

# Nested mapping: data_dict[horizon_step][sensor_id] -> DataFrame with 'pred'
# and 'truth' columns. Horizon steps are 1-based keys (h + 1).
data_dict = defaultdict(lambda: defaultdict(dict))

horizon = 12
for h in range(horizon):
    print(h)
    for i, sensor in enumerate(sensors):
        # Axis 0 of the arrays is the horizon step, axis 2 the sensor position.
        data_dict[h+1][sensor] = pd.DataFrame({'pred': prediction[h, :, i],
                                               'truth': truth[h, :, i]})

# Size the timestamp list from the array itself rather than from a variable
# leaked out of the inner loop, which would raise NameError if `sensors` were
# empty.
data_dict['time'] = time_series[:prediction.shape[1]]

0
1
2
3
4
5
6
7
8
9
10
11
Wall time: 7.3 s


In [24]:
# Sanity check: the keys should be the 1-based horizon steps plus 'time'.
data_dict.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'time'])

In [25]:
# Peek at the pred/truth frame stored for horizon step 2, sensor '400001'.
data_dict[2]['400001'].head()

Unnamed: 0,pred,truth
0,71.314423,70.800003
1,56.677261,59.799999
2,34.003334,32.200001
3,70.769775,70.599998
4,67.454239,65.800003


In [27]:
# First ten timestamps stored under the 'time' key.
data_dict['time'][:10]

[Timestamp('2020-01-01 01:05:00'),
 Timestamp('2020-01-01 01:10:00'),
 Timestamp('2020-01-01 01:15:00'),
 Timestamp('2020-01-01 01:20:00'),
 Timestamp('2020-01-01 01:25:00'),
 Timestamp('2020-01-01 01:30:00'),
 Timestamp('2020-01-01 01:35:00'),
 Timestamp('2020-01-01 01:40:00'),
 Timestamp('2020-01-01 01:45:00'),
 Timestamp('2020-01-01 01:50:00')]

In [28]:
def masked_mae_loss(y_pred, y_true):
    """Mean absolute error that ignores positions where `y_true` is 0.

    Positions with a zero target get weight 0; the remaining positions are
    re-weighted by the inverse of the non-zero fraction. Any NaN produced
    along the way (e.g. when every target is 0, or a target is NaN) is
    replaced by 0 before averaging.

    Args:
        y_pred: tensor of predictions.
        y_true: tensor of targets, same shape as `y_pred`.

    Returns:
        A scalar tensor holding the masked mean absolute error.
    """
    nonzero = (y_true != 0).float()
    # Normalise so the weights average to 1 over all positions.
    weights = nonzero / nonzero.mean()

    weighted_err = (y_pred - y_true).abs() * weights
    # Zero out NaNs (0/0 weights or NaN inputs) so they do not poison the mean.
    weighted_err = weighted_err.masked_fill(torch.isnan(weighted_err), 0)

    return weighted_err.mean()

In [54]:
%%time

import plotly.graph_objects as go
from datetime import datetime 

# Horizon steps to plot (keys of data_dict), the sensor to plot, and the
# inclusive date window shown in the figure.
# horizon = [1, 4, 8, 12]
horizon = [12]
station = '400001'
# start_date = datetime.strptime('2020-02-07 8:00:00', '%Y-%m-%d %H:%M:%S')
# end_date = datetime.strptime('2020-02-07 14:35:00', '%Y-%m-%d %H:%M:%S')
start_date = datetime.strptime('2020-02-07', '%Y-%m-%d')
end_date = datetime.strptime('2020-02-20', '%Y-%m-%d')


# colors[0] is used for the ground-truth trace, colors[1:] for the predictions.
colors = ['#EF553B', '#636EFA', '#00CC96', '#AB63FA', '#FFA15A', 
          '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
          '#2CA02C', '#8C564B', '#7F7F7F']

fig = go.Figure()
for idx, h in enumerate(horizon):

    # Attach the timestamps to a NEW frame. Assigning df['time'] directly would
    # write the column into the DataFrame stored in data_dict, so re-running the
    # notebook would change what later cells (e.g. the pickle dump) see.
    df = data_dict[h][station].assign(time=data_dict['time'])
    df_temp = df[(df.time >= start_date) & (df.time <= end_date)]

    # Score each horizon against ITS OWN truth column; previously every horizon
    # after the first was compared with the first horizon's truth slice.
    y_truth = df_temp.truth

    if idx == 0:
        # The ground truth is drawn once, taken from the first horizon plotted.
        fig.add_trace(go.Scatter(x=df_temp.time, y=y_truth,
                                 mode='lines+markers',
                                 name='True',
                                 line=dict(color=colors[0])))

    y_pred = df_temp.pred
    mae_error = masked_mae_loss(torch.tensor(y_pred.values), torch.tensor(y_truth.values))
    fig.add_trace(go.Scatter(x=df_temp.time, y=y_pred,
                             mode='lines',
                             name='Predicted ({} pts, MAE: {:.2f})'.format(h, mae_error),
                             # +1 skips the colour reserved for the truth trace.
                             line=dict(color=colors[idx + 1])))


fig.update_layout(
    title='<b>DCRNN Traffic Predictions for Multiple Horizons, each pt is 5 mins</b>',
#     xaxis_title='<b>Time Stamp</b>',
    yaxis_title='<b>Average Speed</b>',
#     legend_title="Legend Title",
    font=dict(
        size=14
    )
)

fig.update_xaxes(
    rangeslider_visible=True
)


fig.show()

Wall time: 296 ms


In [30]:
# Report the top-level keys and the container type, one per line.
print(data_dict.keys(), type(data_dict), sep='\n')

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'time'])
<class 'collections.defaultdict'>


In [32]:
%%time 

# save dict as pickle file
save_fp = r'C:\Users\rmartinez4\OneDrive - Illumina, Inc\Documents'

# Write inside a context manager so the file is flushed and closed once the dump
# finishes; leaving the handle open risks an incomplete file on disk.
# The outer defaultdict is converted to a plain dict before dumping
# (NOTE(review): presumably because its lambda default factory cannot be pickled).
with open(join(save_fp, 'traffic_pred_horizon12.pkl'), "wb") as file_to_write:
    pickle.dump(dict(data_dict), file_to_write)


Wall time: 2.79 s


In [33]:
%%time 

save_fp = r'C:\Users\rmartinez4\OneDrive - Illumina, Inc\Documents'

# Round-trip check: read the saved pickle back into memory.
pickle_path = join(save_fp, 'traffic_pred_horizon12.pkl')
with open(pickle_path, 'rb') as pickle_file:
    in_data = pickle.load(pickle_file)
    

Wall time: 2.3 s


In [34]:
# The object was dumped via dict(data_dict), so a plain dict is expected here.
type(in_data)

dict

In [35]:
# The reloaded keys should match data_dict: the horizon steps plus 'time'.
in_data.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'time'])

In [7]:
# data_dict[1]['400001']

In [40]:
# Peek at the reloaded frame for horizon step 2, sensor '400001'.
in_data[2]['400001'].head()

Unnamed: 0,pred,truth
0,71.314423,70.800003
1,56.677261,59.799999
2,34.003334,32.200001
3,70.769775,70.599998
4,67.454239,65.800003
