In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import utils
# Run configuration loaded through the project's utils.Params from ../init_params.json.
# Attributes read further down: csv_path, dataset, window_size, stride_size,
# num_covariates, pred_days.
params = utils.Params('../init_params.json')

In [2]:
def create_directory(save_path):
    """Create ``save_path`` (and any missing parent directories).

    Does nothing when the directory already exists. ``exist_ok=True`` replaces the
    former check-then-create sequence, which could fail if the directory appeared
    between the check and the ``makedirs`` call.

    Raises:
        FileExistsError: if ``save_path`` exists but is not a directory.
    """
    os.makedirs(save_path, exist_ok=True)

In [3]:
def get_subtracted_date(date_string:str, lag_days:int = 90):
    """Return the date ``lag_days`` days before ``date_string``.

    Both the input and the returned value are 'YYYY-MM-DD' strings.
    """
    iso_format = '%Y-%m-%d'
    anchor_day = datetime.strptime(date_string, iso_format).date()
    shifted_day = anchor_day - timedelta(days=lag_days)
    return shifted_day.strftime(iso_format)

In [4]:
# Run configuration taken from ``params``.
csv_path = '../'+params.csv_path #'../Data/Dhaka_Final_v4.csv'
save_name = params.dataset
window_size = params.window_size
stride_size = params.stride_size
num_covariates = params.num_covariates
pred_days = params.pred_days

# Inclusive date bounds of the three splits.
# Each evaluation split starts (window_size - pred_days - 1) days before the end of
# the preceding split, so (assuming one row per day) its first window ends exactly
# pred_days days after that split and the earlier rows serve as conditioning history.
# The start dates are derived from the preceding split's end instead of repeating the
# date literal, so the two cannot drift apart.
train_start = '2013-01-01'
train_end = '2016-12-31'
test_start = get_subtracted_date(train_end, window_size-pred_days-1)
test_end = '2017-12-31'
test_start_2 = get_subtracted_date(test_end, window_size-pred_days-1)
test_end_2 = '2018-12-31'


In [5]:
# Load the CSV with its first column parsed as the (date) index.
data_frame_org = pd.read_csv(csv_path, index_col=0, parse_dates=True)
# Epoch seconds for every row; later used as the per-window date stamp.
# NOTE(review): the integer division by 10**9 assumes the index is stored with
# nanosecond resolution - confirm if the pandas version in use infers another unit.
data_frame_org['unix'] = data_frame_org.index.astype(np.int64) // 10 ** 9
# Working copy; data_frame_org is kept unmodified for date lookups.
data_frame = data_frame_org.copy()

In [6]:
# Row position of the first non-zero entry in the first column (argmax of the boolean mask).
# NaN compares as "!= 0", so a missing value also counts as non-zero at this point.
(data_frame.iloc[:,0:1].values!=0).argmax(axis=0)

array([0], dtype=int64)

In [7]:
def gen_covariates(data_frame, times, num_covariates, verbose = 0):
    """Build the covariate matrix for one slice of the data.

    Column layout of the returned array:
        0      left at zero here (reserved; filled by the caller)
        1-2    sin/cos of day-of-week (period 7)
        3-4    sin/cos of ISO week number (period 52)
        5-6    sin/cos of month (period 12)
        7-14   min-max scaled 'WS','Temp','RH','RF','NOx','SO2','O3','CO'
        15+    left at zero

    Args:
        data_frame: frame indexed by a DatetimeIndex that contains the feature
            columns listed above. Calendar features are taken from its index.
        times: only ``times.shape[0]`` is used, as the number of rows.
        num_covariates: number of columns of the result; must be at least 15
            for the feature columns to fit.
        verbose: when truthy, print min and max of columns 1..num_covariates-1.

    Returns:
        Array of shape ``(times.shape[0], num_covariates)``.

    NOTE(review): min/max are computed on the frame passed in, so every split
    that calls this function is scaled with its own statistics.
    """
    print(times.shape[0])
    covariates = np.zeros((times.shape[0], num_covariates))

    index = data_frame.index
    weekday = np.asarray(index.weekday, dtype=float)
    month = np.asarray(index.month, dtype=float)
    # DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week yields the same ISO week numbers.
    if hasattr(index, 'isocalendar'):
        week = index.isocalendar().week.to_numpy(dtype=float)
    else:
        week = np.asarray(index.week, dtype=float)

    covariates[:, 1] = np.sin(2*np.pi*weekday/7)
    covariates[:, 2] = np.cos(2*np.pi*weekday/7)
    covariates[:, 3] = np.sin(2*np.pi*week/52)
    covariates[:, 4] = np.cos(2*np.pi*week/52)
    covariates[:, 5] = np.sin(2*np.pi*month/12)
    covariates[:, 6] = np.cos(2*np.pi*month/12)

    #PM2.5	PM10	SO2	CO	NOx	O3	WS	Temp	RH	RF
    feature_list = ['WS','Temp','RH','RF','NOx','SO2','O3','CO']

    for i, val in enumerate(feature_list):
        column = data_frame[val]
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: 0/0 would fill the covariate with NaN.
            covariates[:, i+7] = 0.0
        else:
            covariates[:, i+7] = (column - low) / span
    if verbose:
        for i in range(1,num_covariates):
            print(i, covariates[:,i].min(),covariates[:,i].max())
    print(covariates[:, :num_covariates].shape, covariates.shape)
    return covariates[:, :num_covariates]

In [8]:
def visualize(data, week_start):
    """Plot ``window_size`` consecutive values of ``data`` and save them to visual.png.

    The slice starts at position ``week_start``; ``window_size`` is read from the
    module-level configuration. The figure is closed after saving.
    """
    window_stop = week_start + window_size
    figure, axes = plt.subplots()
    axes.plot(np.arange(window_size), data[week_start:window_stop], color='b')
    figure.savefig("visual.png")
    plt.close(figure)

In [9]:
# (rows, columns) of the working copy, including the added 'unix' column.
data_frame.shape

(2342, 11)

In [10]:
# Preview only: resample to daily bins ('D'), summing the rows of each bin, and show
# the train_start..test_end range. The result is displayed, not assigned, so
# data_frame itself is unchanged.
# NOTE(review): with closed='right' and label='left' a row stamped exactly at midnight
# is reported under the previous day's label - confirm this shift is intended.
data_frame.resample('D',label = 'left',closed = 'right').sum()[train_start:test_end]

Unnamed: 0_level_0,PM2.5,PM10,SO2,CO,NOx,O3,WS,Temp,RH,RF,unix
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-01,267.760636,447.535746,3.9436,0.3086,13.4329,4.5049,1.4104,18.6961,67.1973,0.0467,1357084800
2013-01-02,262.088263,449.070512,3.8702,0.3405,16.0926,5.3954,1.6614,20.0829,68.4158,0.0482,1357171200
2013-01-03,222.661851,372.724157,4.2239,0.3858,19.5855,6.5480,1.2790,20.1200,70.4625,0.0226,1357257600
2013-01-04,164.089142,278.164502,4.0392,0.5365,31.8197,10.6209,1.3413,18.7043,62.1733,0.0200,1357344000
2013-01-05,184.125506,318.132369,3.9436,0.5831,27.4758,9.1723,1.6908,18.3696,58.7264,0.0200,1357430400
...,...,...,...,...,...,...,...,...,...,...,...
2017-12-27,160.469126,222.869226,19.4841,1.3709,82.6087,0.4828,1.2338,19.3742,86.0717,0.0200,1514419200
2017-12-28,212.430986,337.818023,11.6385,1.4957,103.3127,0.5753,1.3541,20.1446,83.4129,0.0200,1514505600
2017-12-29,161.460330,268.324983,10.8066,1.1903,113.8928,0.5632,1.4250,22.1688,68.9958,0.0200,1514592000
2017-12-30,231.033679,340.199059,6.7502,1.0300,78.8729,0.6094,0.6071,21.3067,67.1667,0.0219,1514678400


In [11]:
# Missing readings become 0; prep_data treats zeros as "no data" when it computes
# the per-window scale factor.
data_frame = data_frame.fillna(0)

# Output folder for the generated .npy files: data/<dataset name>.
save_path = os.path.join('data', save_name)
print('save_path', save_path)
create_directory(save_path)

def prep_data(data, covariates, data_start, date_array, train = True, validation = False, verbose = True):
    """Cut one data split into fixed-length windows and save them as ``.npy`` files.

    Reads the module-level names ``window_size``, ``stride_size``, ``num_series``,
    ``num_covariates``, ``total_time``, ``save_path`` and ``save_name``.

    Args:
        data: 2-D array, one row per time step; column ``s`` holds series ``s``.
        covariates: 2-D array with ``num_covariates`` columns and one row per time
            step. Column 0 is overwritten IN PLACE with the z-scored position
            index ("age") of each row, so the caller's array is modified.
        data_start: per-column row position of the first non-zero value.
        date_array: 1-D array of date stamps aligned with the rows of ``data``.
        train: truthy -> windows start at ``data_start`` and labels are scaled;
            falsy -> windows start at row 0 and labels keep their original scale.
        validation: only used when ``train`` is falsy; selects the file prefix
            ('test_' when truthy, 'test_final_' otherwise).
        verbose: print the bounds, date stamp and last label of every window.

    Returns:
        x_input: float32 ``(windows, window_size, num_covariates + 2)``. Channel 0
            is the target shifted by one step (first step stays 0), channels
            1..num_covariates are the covariates, the last channel is the series id.
        label: float32 ``(windows, window_size)`` target values of each window.
        v_input: float32 ``(windows, 2)``; column 0 is the window's scale factor
            (0 when the window has no non-zero inputs), column 1 is left at zero.
        date_values: int64 ``(windows, 1)`` date stamp of the last row of each window.
    """
    time_len = data.shape[0]
    input_size = window_size-stride_size
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    if train:
        # Training windows start at each series' first non-zero row. The strides
        # skipped before that row must not be counted, otherwise the last windows
        # reach past the end of ``data`` and the slice assignments below fail.
        skipped = (np.asarray(data_start)[:num_series] + stride_size - 1) // stride_size
        windows_per_series = np.maximum(windows_per_series - skipped, 0)

    total_windows = np.sum(windows_per_series)
    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
    label = np.zeros((total_windows, window_size), dtype='float32')
    v_input = np.zeros((total_windows, 2), dtype='float32')
    date_values = np.zeros((total_windows, 1), dtype='int64')
    count = 0 # number of windows written so far
    if not train:
        covariates = covariates[-time_len:]
    for series in range(num_series):
        # z-scores of the row positions counted from the series' first non-zero row
        cov_age = stats.zscore(np.arange(total_time-data_start[series]))
        if train:
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
        else:
            covariates[:, 0] = cov_age[-time_len:]
        for i in range(windows_per_series[series]):
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start+window_size
            if verbose:
                print('series:', series, ',count:', count, ',i:', i)
                print("window start: ", window_start)
                print("window end: ", window_end)

            # Input channel 0 is the target lagged by one step: the window's last
            # value is excluded and the first step stays 0.
            x_input[count, 1:, 0] = data[window_start:window_end-1, series]
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
            x_input[count, :, -1] = series
            # The label keeps the full window, including the last step.
            label[count, :] = data[window_start:window_end, series]

            date_values[count] = date_array[window_end-1]
            if verbose:
                print('date, value', date_values[count][0], label[count, -1])
            # Scale factor: mean of the non-zero lagged inputs in steps
            # 1..input_size-1, plus 1. Zeros are treated as missing.
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
            if nonzero_sum == 0:
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(), nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1

    file_name = 'train_' if train else ('test_' if validation else 'test_final_')

    print(x_input.shape, v_input.shape, label.shape)

    np.save( os.path.join(save_path, file_name+'dates_'+save_name), date_values)
    np.save( os.path.join(save_path, file_name+'data_'+save_name), x_input)
    np.save( os.path.join(save_path, file_name+'v_'+save_name), v_input)
    np.save( os.path.join(save_path, file_name+'label_'+save_name), label)

    return x_input, label, v_input, date_values

save_path data\Dhaka


In [12]:
### Prepare training data

# Module-level values that prep_data reads.
num_series = 1
total_time = len(data_frame)

train_frame = data_frame.loc[train_start:train_end]
covariates = gen_covariates(train_frame, train_frame.index, num_covariates)
train_data = train_frame.to_numpy()
train_dates = data_frame_org.loc[train_start:train_end, 'unix'].to_numpy()

# Row position of the first non-zero value in every column.
data_start = (train_data != 0).argmax(axis=0)

x_input, label, v_input, date_values = prep_data(train_data, covariates, data_start, train_dates)
# Show the source row that the last window ends on.
data_frame_org.loc[data_frame_org['unix'] == date_values[-1, 0]]

1461
(1461, 16) (1461, 16)
series: 0 ,count: 0 ,i: 0
window start:  0
window end:  98
date, value 1365379200 56.417267
series: 0 ,count: 1 ,i: 1
window start:  1
window end:  99
date, value 1365465600 50.032616
series: 0 ,count: 2 ,i: 2
window start:  2
window end:  100
date, value 1365552000 46.422253
series: 0 ,count: 3 ,i: 3
window start:  3
window end:  101
date, value 1365638400 54.261463
series: 0 ,count: 4 ,i: 4
window start:  4
window end:  102
date, value 1365724800 53.239243
series: 0 ,count: 5 ,i: 5
window start:  5
window end:  103
date, value 1365811200 48.045162
series: 0 ,count: 6 ,i: 6
window start:  6
window end:  104
date, value 1365897600 73.89101
series: 0 ,count: 7 ,i: 7
window start:  7
window end:  105
date, value 1365984000 60.807446
series: 0 ,count: 8 ,i: 8
window start:  8
window end:  106
date, value 1366070400 61.88899
series: 0 ,count: 9 ,i: 9
window start:  9
window end:  107
date, value 1366156800 54.23835
series: 0 ,count: 10 ,i: 10
window start:  10
wi

series: 0 ,count: 161 ,i: 161
window start:  161
window end:  259
date, value 1379289600 53.26181
series: 0 ,count: 162 ,i: 162
window start:  162
window end:  260
date, value 1379376000 42.117146
series: 0 ,count: 163 ,i: 163
window start:  163
window end:  261
date, value 1379462400 21.024574
series: 0 ,count: 164 ,i: 164
window start:  164
window end:  262
date, value 1379548800 14.3688965
series: 0 ,count: 165 ,i: 165
window start:  165
window end:  263
date, value 1379635200 24.579092
series: 0 ,count: 166 ,i: 166
window start:  166
window end:  264
date, value 1379721600 57.114605
series: 0 ,count: 167 ,i: 167
window start:  167
window end:  265
date, value 1379808000 60.29334
series: 0 ,count: 168 ,i: 168
window start:  168
window end:  266
date, value 1379894400 74.9688
series: 0 ,count: 169 ,i: 169
window start:  169
window end:  267
date, value 1379980800 84.71192
series: 0 ,count: 170 ,i: 170
window start:  170
window end:  268
date, value 1380067200 69.71656
series: 0 ,coun

window end:  342
date, value 1386460800 115.70368
series: 0 ,count: 245 ,i: 245
window start:  245
window end:  343
date, value 1386547200 141.27295
series: 0 ,count: 246 ,i: 246
window start:  246
window end:  344
date, value 1386633600 101.81244
series: 0 ,count: 247 ,i: 247
window start:  247
window end:  345
date, value 1386720000 139.82552
series: 0 ,count: 248 ,i: 248
window start:  248
window end:  346
date, value 1386806400 205.31
series: 0 ,count: 249 ,i: 249
window start:  249
window end:  347
date, value 1386892800 171.50116
series: 0 ,count: 250 ,i: 250
window start:  250
window end:  348
date, value 1386979200 179.58626
series: 0 ,count: 251 ,i: 251
window start:  251
window end:  349
date, value 1387065600 190.98666
series: 0 ,count: 252 ,i: 252
window start:  252
window end:  350
date, value 1387152000 182.30818
series: 0 ,count: 253 ,i: 253
window start:  253
window end:  351
date, value 1387238400 141.23357
series: 0 ,count: 254 ,i: 254
window start:  254
window end:  

date, value 1394064000 151.26302
series: 0 ,count: 333 ,i: 333
window start:  333
window end:  431
date, value 1394150400 141.23306
series: 0 ,count: 334 ,i: 334
window start:  334
window end:  432
date, value 1394236800 186.46687
series: 0 ,count: 335 ,i: 335
window start:  335
window end:  433
date, value 1394323200 135.54764
series: 0 ,count: 336 ,i: 336
window start:  336
window end:  434
date, value 1394409600 148.89648
series: 0 ,count: 337 ,i: 337
window start:  337
window end:  435
date, value 1394496000 166.08224
series: 0 ,count: 338 ,i: 338
window start:  338
window end:  436
date, value 1394582400 159.28365
series: 0 ,count: 339 ,i: 339
window start:  339
window end:  437
date, value 1394668800 160.49644
series: 0 ,count: 340 ,i: 340
window start:  340
window end:  438
date, value 1394755200 106.76849
series: 0 ,count: 341 ,i: 341
window start:  341
window end:  439
date, value 1394841600 118.576744
series: 0 ,count: 342 ,i: 342
window start:  342
window end:  440
date, val

series: 0 ,count: 418 ,i: 418
window start:  418
window end:  516
date, value 1401494400 42.79463
series: 0 ,count: 419 ,i: 419
window start:  419
window end:  517
date, value 1401580800 41.259884
series: 0 ,count: 420 ,i: 420
window start:  420
window end:  518
date, value 1401667200 49.62313
series: 0 ,count: 421 ,i: 421
window start:  421
window end:  519
date, value 1401753600 37.522274
series: 0 ,count: 422 ,i: 422
window start:  422
window end:  520
date, value 1401840000 38.65029
series: 0 ,count: 423 ,i: 423
window start:  423
window end:  521
date, value 1401926400 35.126984
series: 0 ,count: 424 ,i: 424
window start:  424
window end:  522
date, value 1402012800 30.024746
series: 0 ,count: 425 ,i: 425
window start:  425
window end:  523
date, value 1402099200 25.473703
series: 0 ,count: 426 ,i: 426
window start:  426
window end:  524
date, value 1402185600 20.807125
series: 0 ,count: 427 ,i: 427
window start:  427
window end:  525
date, value 1402272000 17.731878
series: 0 ,co

window end:  602
date, value 1408924800 40.27031
series: 0 ,count: 505 ,i: 505
window start:  505
window end:  603
date, value 1409011200 30.178705
series: 0 ,count: 506 ,i: 506
window start:  506
window end:  604
date, value 1409097600 25.336407
series: 0 ,count: 507 ,i: 507
window start:  507
window end:  605
date, value 1409184000 26.596546
series: 0 ,count: 508 ,i: 508
window start:  508
window end:  606
date, value 1409270400 26.220987
series: 0 ,count: 509 ,i: 509
window start:  509
window end:  607
date, value 1409356800 24.621706
series: 0 ,count: 510 ,i: 510
window start:  510
window end:  608
date, value 1409443200 27.403297
series: 0 ,count: 511 ,i: 511
window start:  511
window end:  609
date, value 1409529600 15.645011
series: 0 ,count: 512 ,i: 512
window start:  512
window end:  610
date, value 1409616000 23.767302
series: 0 ,count: 513 ,i: 513
window start:  513
window end:  611
date, value 1409702400 23.283863
series: 0 ,count: 514 ,i: 514
window start:  514
window end:

series: 0 ,count: 589 ,i: 589
window start:  589
window end:  687
date, value 1416268800 144.95255
series: 0 ,count: 590 ,i: 590
window start:  590
window end:  688
date, value 1416355200 123.45658
series: 0 ,count: 591 ,i: 591
window start:  591
window end:  689
date, value 1416441600 118.93479
series: 0 ,count: 592 ,i: 592
window start:  592
window end:  690
date, value 1416528000 157.64471
series: 0 ,count: 593 ,i: 593
window start:  593
window end:  691
date, value 1416614400 144.63583
series: 0 ,count: 594 ,i: 594
window start:  594
window end:  692
date, value 1416700800 147.99132
series: 0 ,count: 595 ,i: 595
window start:  595
window end:  693
date, value 1416787200 178.92622
series: 0 ,count: 596 ,i: 596
window start:  596
window end:  694
date, value 1416873600 196.59396
series: 0 ,count: 597 ,i: 597
window start:  597
window end:  695
date, value 1416960000 176.05981
series: 0 ,count: 598 ,i: 598
window start:  598
window end:  696
date, value 1417046400 202.2766
series: 0 ,

series: 0 ,count: 673 ,i: 673
window start:  673
window end:  771
date, value 1423526400 261.34985
series: 0 ,count: 674 ,i: 674
window start:  674
window end:  772
date, value 1423612800 212.85976
series: 0 ,count: 675 ,i: 675
window start:  675
window end:  773
date, value 1423699200 167.76683
series: 0 ,count: 676 ,i: 676
window start:  676
window end:  774
date, value 1423785600 199.3376
series: 0 ,count: 677 ,i: 677
window start:  677
window end:  775
date, value 1423872000 205.74074
series: 0 ,count: 678 ,i: 678
window start:  678
window end:  776
date, value 1423958400 103.25146
series: 0 ,count: 679 ,i: 679
window start:  679
window end:  777
date, value 1424044800 108.20981
series: 0 ,count: 680 ,i: 680
window start:  680
window end:  778
date, value 1424131200 185.90285
series: 0 ,count: 681 ,i: 681
window start:  681
window end:  779
date, value 1424217600 161.26859
series: 0 ,count: 682 ,i: 682
window start:  682
window end:  780
date, value 1424304000 136.12846
series: 0 ,

series: 0 ,count: 758 ,i: 758
window start:  758
window end:  856
date, value 1430870400 51.007423
series: 0 ,count: 759 ,i: 759
window start:  759
window end:  857
date, value 1430956800 65.86511
series: 0 ,count: 760 ,i: 760
window start:  760
window end:  858
date, value 1431043200 29.7597
series: 0 ,count: 761 ,i: 761
window start:  761
window end:  859
date, value 1431129600 36.34428
series: 0 ,count: 762 ,i: 762
window start:  762
window end:  860
date, value 1431216000 37.635956
series: 0 ,count: 763 ,i: 763
window start:  763
window end:  861
date, value 1431302400 49.44658
series: 0 ,count: 764 ,i: 764
window start:  764
window end:  862
date, value 1431388800 64.543434
series: 0 ,count: 765 ,i: 765
window start:  765
window end:  863
date, value 1431475200 60.65966
series: 0 ,count: 766 ,i: 766
window start:  766
window end:  864
date, value 1431561600 40.61708
series: 0 ,count: 767 ,i: 767
window start:  767
window end:  865
date, value 1431648000 36.527554
series: 0 ,count:

series: 0 ,count: 852 ,i: 852
window start:  852
window end:  950
date, value 1438992000 47.980656
series: 0 ,count: 853 ,i: 853
window start:  853
window end:  951
date, value 1439078400 24.153215
series: 0 ,count: 854 ,i: 854
window start:  854
window end:  952
date, value 1439164800 14.312045
series: 0 ,count: 855 ,i: 855
window start:  855
window end:  953
date, value 1439251200 34.161983
series: 0 ,count: 856 ,i: 856
window start:  856
window end:  954
date, value 1439337600 22.915752
series: 0 ,count: 857 ,i: 857
window start:  857
window end:  955
date, value 1439424000 13.778268
series: 0 ,count: 858 ,i: 858
window start:  858
window end:  956
date, value 1439510400 21.758963
series: 0 ,count: 859 ,i: 859
window start:  859
window end:  957
date, value 1439596800 21.700531
series: 0 ,count: 860 ,i: 860
window start:  860
window end:  958
date, value 1439683200 19.876865
series: 0 ,count: 861 ,i: 861
window start:  861
window end:  959
date, value 1439769600 21.846828
series: 0 

series: 0 ,count: 949 ,i: 949
window start:  949
window end:  1047
date, value 1447372800 152.37164
series: 0 ,count: 950 ,i: 950
window start:  950
window end:  1048
date, value 1447459200 141.94907
series: 0 ,count: 951 ,i: 951
window start:  951
window end:  1049
date, value 1447545600 149.42859
series: 0 ,count: 952 ,i: 952
window start:  952
window end:  1050
date, value 1447632000 128.07854
series: 0 ,count: 953 ,i: 953
window start:  953
window end:  1051
date, value 1447718400 178.54564
series: 0 ,count: 954 ,i: 954
window start:  954
window end:  1052
date, value 1447804800 176.48018
series: 0 ,count: 955 ,i: 955
window start:  955
window end:  1053
date, value 1447891200 127.336555
series: 0 ,count: 956 ,i: 956
window start:  956
window end:  1054
date, value 1447977600 106.657295
series: 0 ,count: 957 ,i: 957
window start:  957
window end:  1055
date, value 1448064000 124.8644
series: 0 ,count: 958 ,i: 958
window start:  958
window end:  1056
date, value 1448150400 112.2406


date, value 1455062400 168.9907
series: 0 ,count: 1039 ,i: 1039
window start:  1039
window end:  1137
date, value 1455148800 162.452
series: 0 ,count: 1040 ,i: 1040
window start:  1040
window end:  1138
date, value 1455235200 131.15782
series: 0 ,count: 1041 ,i: 1041
window start:  1041
window end:  1139
date, value 1455321600 150.53467
series: 0 ,count: 1042 ,i: 1042
window start:  1042
window end:  1140
date, value 1455408000 172.23
series: 0 ,count: 1043 ,i: 1043
window start:  1043
window end:  1141
date, value 1455494400 166.27133
series: 0 ,count: 1044 ,i: 1044
window start:  1044
window end:  1142
date, value 1455580800 171.14241
series: 0 ,count: 1045 ,i: 1045
window start:  1045
window end:  1143
date, value 1455667200 176.0135
series: 0 ,count: 1046 ,i: 1046
window start:  1046
window end:  1144
date, value 1455753600 180.88458
series: 0 ,count: 1047 ,i: 1047
window start:  1047
window end:  1145
date, value 1455840000 132.62706
series: 0 ,count: 1048 ,i: 1048
window start:  

date, value 1462406400 68.274445
series: 0 ,count: 1124 ,i: 1124
window start:  1124
window end:  1222
date, value 1462492800 55.216904
series: 0 ,count: 1125 ,i: 1125
window start:  1125
window end:  1223
date, value 1462579200 56.37435
series: 0 ,count: 1126 ,i: 1126
window start:  1126
window end:  1224
date, value 1462665600 46.058414
series: 0 ,count: 1127 ,i: 1127
window start:  1127
window end:  1225
date, value 1462752000 53.40084
series: 0 ,count: 1128 ,i: 1128
window start:  1128
window end:  1226
date, value 1462838400 61.785507
series: 0 ,count: 1129 ,i: 1129
window start:  1129
window end:  1227
date, value 1462924800 55.67083
series: 0 ,count: 1130 ,i: 1130
window start:  1130
window end:  1228
date, value 1463011200 44.200504
series: 0 ,count: 1131 ,i: 1131
window start:  1131
window end:  1229
date, value 1463097600 44.46416
series: 0 ,count: 1132 ,i: 1132
window start:  1132
window end:  1230
date, value 1463184000 44.631126
series: 0 ,count: 1133 ,i: 1133
window start

date, value 1469491200 26.065525
series: 0 ,count: 1206 ,i: 1206
window start:  1206
window end:  1304
date, value 1469577600 38.960903
series: 0 ,count: 1207 ,i: 1207
window start:  1207
window end:  1305
date, value 1469664000 45.99653
series: 0 ,count: 1208 ,i: 1208
window start:  1208
window end:  1306
date, value 1469750400 56.12776
series: 0 ,count: 1209 ,i: 1209
window start:  1209
window end:  1307
date, value 1469836800 65.60673
series: 0 ,count: 1210 ,i: 1210
window start:  1210
window end:  1308
date, value 1469923200 22.342443
series: 0 ,count: 1211 ,i: 1211
window start:  1211
window end:  1309
date, value 1470009600 20.037804
series: 0 ,count: 1212 ,i: 1212
window start:  1212
window end:  1310
date, value 1470096000 11.175345
series: 0 ,count: 1213 ,i: 1213
window start:  1213
window end:  1311
date, value 1470182400 12.69325
series: 0 ,count: 1214 ,i: 1214
window start:  1214
window end:  1312
date, value 1470268800 13.317906
series: 0 ,count: 1215 ,i: 1215
window start

series: 0 ,count: 1303 ,i: 1303
window start:  1303
window end:  1401
date, value 1477958400 86.89019
series: 0 ,count: 1304 ,i: 1304
window start:  1304
window end:  1402
date, value 1478044800 59.066666
series: 0 ,count: 1305 ,i: 1305
window start:  1305
window end:  1403
date, value 1478131200 54.168354
series: 0 ,count: 1306 ,i: 1306
window start:  1306
window end:  1404
date, value 1478217600 65.14126
series: 0 ,count: 1307 ,i: 1307
window start:  1307
window end:  1405
date, value 1478304000 39.772717
series: 0 ,count: 1308 ,i: 1308
window start:  1308
window end:  1406
date, value 1478390400 32.32716
series: 0 ,count: 1309 ,i: 1309
window start:  1309
window end:  1407
date, value 1478476800 61.138123
series: 0 ,count: 1310 ,i: 1310
window start:  1310
window end:  1408
date, value 1478563200 97.28858
series: 0 ,count: 1311 ,i: 1311
window start:  1311
window end:  1409
date, value 1478649600 74.009476
series: 0 ,count: 1312 ,i: 1312
window start:  1312
window end:  1410
date, v

Unnamed: 0_level_0,PM2.5,PM10,SO2,CO,NOx,O3,WS,Temp,RH,RF,unix
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-12-31,186.266422,368.597868,5.2202,1.2599,18.6428,6.2276,0.6071,20.4446,73.2967,0.02,1483142400


In [13]:
### Prepare validation data but named as test data

# Module-level values that prep_data reads.
num_series = 1
total_time = len(data_frame)

validation_frame = data_frame.loc[test_start:test_end]
covariates = gen_covariates(validation_frame, validation_frame.index, num_covariates)
test_data = validation_frame.to_numpy()
test_dates = data_frame_org.loc[test_start:test_end, 'unix'].to_numpy()

# Row position of the first non-zero value in every column.
data_start = (test_data != 0).argmax(axis=0)

x_input, label, v_input, date_values = prep_data(
    test_data, covariates, data_start, test_dates, train=False, validation=True)
# Print last row
data_frame_org.loc[data_frame_org['unix'] == date_values[-1, 0]]

456
(456, 16) (456, 16)
series: 0 ,count: 0 ,i: 0
window start:  0
window end:  98
date, value 1483747200 146.6923
series: 0 ,count: 1 ,i: 1
window start:  1
window end:  99
date, value 1483833600 161.63678
series: 0 ,count: 2 ,i: 2
window start:  2
window end:  100
date, value 1483920000 173.02876
series: 0 ,count: 3 ,i: 3
window start:  3
window end:  101
date, value 1484006400 209.3119
series: 0 ,count: 4 ,i: 4
window start:  4
window end:  102
date, value 1484092800 155.44064
series: 0 ,count: 5 ,i: 5
window start:  5
window end:  103
date, value 1484179200 133.94173
series: 0 ,count: 6 ,i: 6
window start:  6
window end:  104
date, value 1484265600 118.87196
series: 0 ,count: 7 ,i: 7
window start:  7
window end:  105
date, value 1484352000 130.03528
series: 0 ,count: 8 ,i: 8
window start:  8
window end:  106
date, value 1484438400 179.58438
series: 0 ,count: 9 ,i: 9
window start:  9
window end:  107
date, value 1484524800 159.27795
series: 0 ,count: 10 ,i: 10
window start:  10
wind

date, value 1492646400 47.408016
series: 0 ,count: 104 ,i: 104
window start:  104
window end:  202
date, value 1492732800 46.247486
series: 0 ,count: 105 ,i: 105
window start:  105
window end:  203
date, value 1492819200 47.41337
series: 0 ,count: 106 ,i: 106
window start:  106
window end:  204
date, value 1492905600 54.08661
series: 0 ,count: 107 ,i: 107
window start:  107
window end:  205
date, value 1492992000 44.0046
series: 0 ,count: 108 ,i: 108
window start:  108
window end:  206
date, value 1493078400 55.140278
series: 0 ,count: 109 ,i: 109
window start:  109
window end:  207
date, value 1493164800 55.018673
series: 0 ,count: 110 ,i: 110
window start:  110
window end:  208
date, value 1493251200 63.979183
series: 0 ,count: 111 ,i: 111
window start:  111
window end:  209
date, value 1493337600 60.83577
series: 0 ,count: 112 ,i: 112
window start:  112
window end:  210
date, value 1493424000 41.350723
series: 0 ,count: 113 ,i: 113
window start:  113
window end:  211
date, value 149

window start:  239
window end:  337
date, value 1504396800 25.626123
series: 0 ,count: 240 ,i: 240
window start:  240
window end:  338
date, value 1504483200 23.451569
series: 0 ,count: 241 ,i: 241
window start:  241
window end:  339
date, value 1504569600 27.149883
series: 0 ,count: 242 ,i: 242
window start:  242
window end:  340
date, value 1504656000 32.488213
series: 0 ,count: 243 ,i: 243
window start:  243
window end:  341
date, value 1504742400 21.659744
series: 0 ,count: 244 ,i: 244
window start:  244
window end:  342
date, value 1504828800 19.490238
series: 0 ,count: 245 ,i: 245
window start:  245
window end:  343
date, value 1504915200 21.423004
series: 0 ,count: 246 ,i: 246
window start:  246
window end:  344
date, value 1505001600 30.062687
series: 0 ,count: 247 ,i: 247
window start:  247
window end:  345
date, value 1505088000 42.87912
series: 0 ,count: 248 ,i: 248
window start:  248
window end:  346
date, value 1505174400 58.42681
series: 0 ,count: 249 ,i: 249
window start

Unnamed: 0_level_0,PM2.5,PM10,SO2,CO,NOx,O3,WS,Temp,RH,RF,unix
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-12-31,231.033679,340.199059,6.7502,1.03,78.8729,0.6094,0.6071,21.3067,67.1667,0.0219,1514678400


In [14]:
### Prepare Final test data named as test_final

# Module-level values that prep_data reads.
num_series = 1
total_time = len(data_frame)

final_test_frame = data_frame.loc[test_start_2:test_end_2]
covariates = gen_covariates(final_test_frame, final_test_frame.index, num_covariates)
test_data_2 = final_test_frame.to_numpy()
test_dates_2 = data_frame_org.loc[test_start_2:test_end_2, 'unix'].to_numpy()

# Row position of the first non-zero value in every column.
data_start = (test_data_2 != 0).argmax(axis=0)

x_input, label, v_input, date_values = prep_data(
    test_data_2, covariates, data_start, test_dates_2, train=False, validation=False)
# Print last row
data_frame_org.loc[data_frame_org['unix'] == date_values[-1, 0]]

456
(456, 16) (456, 16)
series: 0 ,count: 0 ,i: 0
window start:  0
window end:  98
date, value 1515283200 191.56258
series: 0 ,count: 1 ,i: 1
window start:  1
window end:  99
date, value 1515369600 214.82713
series: 0 ,count: 2 ,i: 2
window start:  2
window end:  100
date, value 1515456000 193.44894
series: 0 ,count: 3 ,i: 3
window start:  3
window end:  101
date, value 1515542400 196.25922
series: 0 ,count: 4 ,i: 4
window start:  4
window end:  102
date, value 1515628800 233.79764
series: 0 ,count: 5 ,i: 5
window start:  5
window end:  103
date, value 1515715200 212.63979
series: 0 ,count: 6 ,i: 6
window start:  6
window end:  104
date, value 1515801600 227.02017
series: 0 ,count: 7 ,i: 7
window start:  7
window end:  105
date, value 1515888000 233.44673
series: 0 ,count: 8 ,i: 8
window start:  8
window end:  106
date, value 1515974400 218.07867
series: 0 ,count: 9 ,i: 9
window start:  9
window end:  107
date, value 1516060800 180.57726
series: 0 ,count: 10 ,i: 10
window start:  10
wi

series: 0 ,count: 90 ,i: 90
window start:  90
window end:  188
date, value 1523059200 99.19602
series: 0 ,count: 91 ,i: 91
window start:  91
window end:  189
date, value 1523145600 67.66831
series: 0 ,count: 92 ,i: 92
window start:  92
window end:  190
date, value 1523232000 100.003456
series: 0 ,count: 93 ,i: 93
window start:  93
window end:  191
date, value 1523318400 69.32368
series: 0 ,count: 94 ,i: 94
window start:  94
window end:  192
date, value 1523404800 67.19512
series: 0 ,count: 95 ,i: 95
window start:  95
window end:  193
date, value 1523491200 77.173134
series: 0 ,count: 96 ,i: 96
window start:  96
window end:  194
date, value 1523577600 65.644356
series: 0 ,count: 97 ,i: 97
window start:  97
window end:  195
date, value 1523664000 59.59718
series: 0 ,count: 98 ,i: 98
window start:  98
window end:  196
date, value 1523750400 59.048843
series: 0 ,count: 99 ,i: 99
window start:  99
window end:  197
date, value 1523836800 62.383366
series: 0 ,count: 100 ,i: 100
window start: 

window start:  177
window end:  275
date, value 1530576000 36.08576
series: 0 ,count: 178 ,i: 178
window start:  178
window end:  276
date, value 1530662400 35.85907
series: 0 ,count: 179 ,i: 179
window start:  179
window end:  277
date, value 1530748800 30.88616
series: 0 ,count: 180 ,i: 180
window start:  180
window end:  278
date, value 1530835200 38.87907
series: 0 ,count: 181 ,i: 181
window start:  181
window end:  279
date, value 1530921600 19.423971
series: 0 ,count: 182 ,i: 182
window start:  182
window end:  280
date, value 1531008000 29.498615
series: 0 ,count: 183 ,i: 183
window start:  183
window end:  281
date, value 1531094400 28.28858
series: 0 ,count: 184 ,i: 184
window start:  184
window end:  282
date, value 1531180800 32.79584
series: 0 ,count: 185 ,i: 185
window start:  185
window end:  283
date, value 1531267200 33.639065
series: 0 ,count: 186 ,i: 186
window start:  186
window end:  284
date, value 1531353600 27.198313
series: 0 ,count: 187 ,i: 187
window start:  1

window end:  359
date, value 1537833600 47.300865
series: 0 ,count: 262 ,i: 262
window start:  262
window end:  360
date, value 1537920000 53.35132
series: 0 ,count: 263 ,i: 263
window start:  263
window end:  361
date, value 1538006400 70.485504
series: 0 ,count: 264 ,i: 264
window start:  264
window end:  362
date, value 1538092800 94.50168
series: 0 ,count: 265 ,i: 265
window start:  265
window end:  363
date, value 1538179200 98.17126
series: 0 ,count: 266 ,i: 266
window start:  266
window end:  364
date, value 1538265600 85.16736
series: 0 ,count: 267 ,i: 267
window start:  267
window end:  365
date, value 1538352000 95.56239
series: 0 ,count: 268 ,i: 268
window start:  268
window end:  366
date, value 1538438400 113.64206
series: 0 ,count: 269 ,i: 269
window start:  269
window end:  367
date, value 1538524800 79.59339
series: 0 ,count: 270 ,i: 270
window start:  270
window end:  368
date, value 1538611200 85.82409
series: 0 ,count: 271 ,i: 271
window start:  271
window end:  369


Unnamed: 0_level_0,PM2.5,PM10,SO2,CO,NOx,O3,WS,Temp,RH,RF,unix
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-12-31,141.009959,215.653731,7.2371,1.2198,48.2369,16.1003,0.7333,14.345,68.9958,0.0567,1546214400


## Check Prepared Data

In [15]:
def covariate_test(x_input_train, num_covariates, covariates):
    """Check that the first time step of window ``i`` carries covariate row ``i``.

    For each of the first ``num_covariates`` samples (the argument is used as the
    number of samples to check), ``covariates[i, :]`` cast to float32 is compared
    with ``x_input_train[i, 0, 1:-1]``. This only holds when window ``i`` starts
    at row ``i`` of ``covariates``.

    The loop is limited to the samples that actually exist, and mismatches are
    now reported instead of being skipped silently.

    Returns:
        True when every checked sample matched, False otherwise.
    """
    covariates = covariates.astype(np.float32)
    n_checked = min(num_covariates, len(covariates), len(x_input_train))
    all_matched = True
    for i in range(n_checked):
        if np.array_equal(covariates[i, :], x_input_train[i, 0, 1:-1]):
            print("sample {}: covariates matched with prepared dataset".format(i))
        else:
            all_matched = False
            print("sample {}: covariates DO NOT match prepared dataset".format(i))
    return all_matched

def sequential_test(x_input_test, num_samples=10):
    """Check that consecutive windows are shifted by exactly one time step.

    For the first ``num_samples`` window pairs, time step 1 of window ``k`` is
    compared with time step 0 of window ``k + 1`` on every channel except
    channel 0.

    The number of pairs is limited to those that exist (the last window has no
    successor), and failures are now reported instead of being skipped silently.

    Returns:
        True when every checked pair matched, False otherwise.
    """
    n_pairs = max(min(num_samples, len(x_input_test) - 1), 0)
    all_passed = True
    for index in range(n_pairs):
        if np.array_equal(x_input_test[index, 1, 1:], x_input_test[index+1, 0, 1:]):
            print("sample {}, {}: sequential test passed".format(index, index+1))
        else:
            all_passed = False
            print("sample {}, {}: sequential test FAILED".format(index, index+1))
    return all_passed

In [16]:
# x_input and covariates are reassigned by every preparation cell above, so this
# checks whichever split was prepared last.
covariate_test(x_input.copy(), num_covariates, covariates)

sample 0: covariates matched with prepared dataset
sample 1: covariates matched with prepared dataset
sample 2: covariates matched with prepared dataset
sample 3: covariates matched with prepared dataset
sample 4: covariates matched with prepared dataset
sample 5: covariates matched with prepared dataset
sample 6: covariates matched with prepared dataset
sample 7: covariates matched with prepared dataset
sample 8: covariates matched with prepared dataset
sample 9: covariates matched with prepared dataset
sample 10: covariates matched with prepared dataset
sample 11: covariates matched with prepared dataset
sample 12: covariates matched with prepared dataset
sample 13: covariates matched with prepared dataset
sample 14: covariates matched with prepared dataset
sample 15: covariates matched with prepared dataset


In [17]:
# Checks the first 10 consecutive window pairs of whichever split was prepared last.
sequential_test(x_input.copy(), num_samples=10)

sample 0, 1: sequential test passed
sample 1, 2: sequential test passed
sample 2, 3: sequential test passed
sample 3, 4: sequential test passed
sample 4, 5: sequential test passed
sample 5, 6: sequential test passed
sample 6, 7: sequential test passed
sample 7, 8: sequential test passed
sample 8, 9: sequential test passed
sample 9, 10: sequential test passed
