Understanding timeseries

In [75]:
import os
import sys
import json
import zipfile
import numpy as np
import pandas as pd
from urllib.request import urlretrieve

In [11]:
DATA_HOST = "https://archive.ics.uci.edu"
DATA_PATH = "/ml/machine-learning-databases/00321/"
ARCHIVE_NAME = "LD2011_2014.txt.zip"
FILE_NAME = ARCHIVE_NAME[:-4]

def progress_report_hook(count, block_size, total_size):
    mb = int(count * block_size // 1e6)
    if count % 500 == 0:
        sys.stdout.write("\r{} MB downloaded".format(mb))
        sys.stdout.flush()

if not os.path.isfile(FILE_NAME):
    print("downloading dataset (258MB), can take a few minutes depending on your connection")
    urlretrieve(DATA_HOST + DATA_PATH + ARCHIVE_NAME, ARCHIVE_NAME, reporthook=progress_report_hook)

    print("\nextracting data archive")
    zip_ref = zipfile.ZipFile(ARCHIVE_NAME, 'r')
    zip_ref.extractall("./")
    zip_ref.close()
else:
    print("File found skipping download")

File found skipping download


In [13]:
data = pd.read_csv(FILE_NAME, sep=";", index_col=0, parse_dates=True, decimal=',')

<bound method NDFrame.head of                        MT_001     MT_002    MT_003      MT_004     MT_005  \
2011-01-01 00:15:00  0.000000   0.000000  0.000000    0.000000   0.000000   
2011-01-01 00:30:00  0.000000   0.000000  0.000000    0.000000   0.000000   
2011-01-01 00:45:00  0.000000   0.000000  0.000000    0.000000   0.000000   
2011-01-01 01:00:00  0.000000   0.000000  0.000000    0.000000   0.000000   
2011-01-01 01:15:00  0.000000   0.000000  0.000000    0.000000   0.000000   
...                       ...        ...       ...         ...        ...   
2014-12-31 23:00:00  2.538071  22.048364  1.737619  150.406504  85.365854   
2014-12-31 23:15:00  2.538071  21.337127  1.737619  166.666667  81.707317   
2014-12-31 23:30:00  2.538071  20.625889  1.737619  162.601626  82.926829   
2014-12-31 23:45:00  1.269036  21.337127  1.737619  166.666667  85.365854   
2015-01-01 00:00:00  2.538071  19.914651  1.737619  178.861789  84.146341   

                         MT_006     MT_007   

In [16]:
print(data.shape)
print(data.describe())
# print(data.head)

(140256, 370)
              MT_001         MT_002         MT_003         MT_004  \
count  140256.000000  140256.000000  140256.000000  140256.000000   
mean        3.970785      20.768480       2.918308      82.184490   
std         5.983965      13.272415      11.014456      58.248392   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       2.844950       0.000000      36.585366   
50%         1.269036      24.893314       1.737619      87.398374   
75%         2.538071      29.871977       1.737619     115.853659   
max        48.223350     115.220484     151.172893     321.138211   

              MT_005         MT_006         MT_007         MT_008  \
count  140256.000000  140256.000000  140256.000000  140256.000000   
mean       37.240309     141.227385       4.521338     191.401476   
std        26.461327      98.439984       6.485684     121.981187   
min         0.000000       0.000000       0.000000       0.000000   
25%        15.85365

In [19]:
print(data.head(2))

                     MT_001  MT_002  MT_003  MT_004  MT_005  MT_006  MT_007  \
2011-01-01 00:15:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2011-01-01 00:30:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

                     MT_008  MT_009  MT_010  ...  MT_361  MT_362  MT_363  \
2011-01-01 00:15:00     0.0     0.0     0.0  ...     0.0     0.0     0.0   
2011-01-01 00:30:00     0.0     0.0     0.0  ...     0.0     0.0     0.0   

                     MT_364  MT_365  MT_366  MT_367  MT_368  MT_369  MT_370  
2011-01-01 00:15:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0  
2011-01-01 00:30:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0  

[2 rows x 370 columns]


In [12]:
num_timeseries = data.shape[1]
data_kw = data.resample('2H').sum() / 8
timeseries = []
for i in range(num_timeseries):
    timeseries.append(np.trim_zeros(data_kw.iloc[:,i], trim='f'))

In [30]:
timeseries[1]

2012-01-01 00:00:00    19.825747
2012-01-01 02:00:00    22.848506
2012-01-01 04:00:00    21.959459
2012-01-01 06:00:00    22.137269
2012-01-01 08:00:00    22.581792
                         ...    
2014-12-31 16:00:00    28.538407
2014-12-31 18:00:00    28.449502
2014-12-31 20:00:00    24.004267
2014-12-31 22:00:00    21.692745
2015-01-01 00:00:00     2.489331
Freq: 2H, Name: MT_002, Length: 13153, dtype: float64

In [39]:
# First and last dates
print(timeseries[0].index[0])
print(timeseries[-1].index[-1])

2012-01-01 00:00:00
2015-01-01 00:00:00


In [33]:
freq = '2H'
prediction_length = 7 * 12
context_length = 7 * 12
start_dataset = pd.Timestamp("2014-01-01 00:00:00", freq=freq)
end_training = pd.Timestamp("2014-09-01 00:00:00", freq=freq)

In [44]:
# timeseries.index.get_loc(timeseries.loc[:,:,'2014-09-01'].index[0])
type(timeseries)

list

In [68]:
start_training_idx = 0
# Get 1st instance of first row in training series
for end_training_idx, row in enumerate(timeseries[0].index):
    if row == start_dataset:
        print(i, row) 

# Get 1st instance of last row in training series
end_training_idx = 0
for end_training_idx, row in enumerate(timeseries[0].index):
    if row == end_training:
        print(i, row) 
end_training_idx -= 1 # Decrement is required becuase the list offset would include the 1st row of test set
print(timeseries[0].index[end_training_idx])

13152 2014-01-01 00:00:00
13152 2014-09-01 00:00:00
2014-12-31 22:00:00


In [69]:
training_data = [
    {
        "start": str(start_dataset),
        "target": ts[start_training_idx:end_training_idx].tolist()  # We use -1, because pandas indexing includes the upper bound 
    }
    for ts in timeseries
]
print(len(training_data))

370


In [70]:
num_test_windows = 4

test_data = [
    {
        "start": str(start_dataset),
        "target": ts[start_training_idx:end_training_idx + k * prediction_length].tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for ts in timeseries
]
print(len(test_data))

1480


In [73]:
def write_dicts_to_file(path, data):
    with open(path, 'wb') as fp:
        for d in data:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write("\n".encode('utf-8'))

In [76]:
%%time
write_dicts_to_file("train.json", training_data)
write_dicts_to_file("test.json", test_data)

CPU times: user 14.1 s, sys: 146 ms, total: 14.2 s
Wall time: 14.3 s


In [78]:
!ls -lh

total 2813448
-rw-r--r--  1 k  staff   678M May 25 10:14 LD2011_2014.txt
-rw-r--r--  1 k  staff   249M May 25 10:13 LD2011_2014.txt.zip
-rw-r--r--  1 k  staff    18K May 25 21:31 TSbreakdown.ipynb
drwxr-xr-x  3 k  staff    96B May 25 10:14 [34m__MACOSX[m[m
-rw-r--r--  1 k  staff   543B Jan 20 10:48 findInDir.js
-rw-r--r--  1 k  staff   333M May 25 21:31 test.json
-rw-r--r--  1 k  staff    83M May 25 21:31 train.json
