Understanding timeseries

In [None]:
import os
import sys
import json
import zipfile
import numpy as np
import pandas as pd
from urllib.request import urlretrieve

In [None]:
# Remote location of the zipped dataset on the UCI archive server.
DATA_HOST = "https://archive.ics.uci.edu"
DATA_PATH = "/ml/machine-learning-databases/00321/"

# Name of the zip archive, and of the text file it is expected to contain
# (the archive name with its ".zip" extension removed).
ARCHIVE_NAME = "LD2011_2014.txt.zip"
FILE_NAME = os.path.splitext(ARCHIVE_NAME)[0]

def progress_report_hook(count, block_size, total_size):
    """Report download progress on stdout; usable as a ``urlretrieve`` reporthook.

    Only every 500th block triggers an update. The update rewrites the current
    terminal line (leading carriage return) with the number of whole megabytes
    transferred so far.

    Args:
        count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the download; accepted to match the
            reporthook signature but not used.
    """
    if count % 500 != 0:
        return
    megabytes = int(count * block_size // 1e6)
    sys.stdout.write("\r{} MB downloaded".format(megabytes))
    sys.stdout.flush()

# Download and unpack the dataset only when the extracted file is not already
# present in the working directory.
if not os.path.isfile(FILE_NAME):
    print("downloading dataset (258MB), can take a few minutes depending on your connection")
    urlretrieve(DATA_HOST + DATA_PATH + ARCHIVE_NAME, ARCHIVE_NAME, reporthook=progress_report_hook)

    print("\nextracting data archive")
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(ARCHIVE_NAME, 'r') as zip_ref:
        zip_ref.extractall("./")
else:
    print("File found skipping download")

In [None]:
# The file is semicolon-separated and uses a comma as the decimal mark.
# The first column becomes the index and is parsed as dates.
data = pd.read_csv(
    FILE_NAME,
    sep=";",
    decimal=',',
    index_col=0,
    parse_dates=True,
)

In [None]:
# Quick overview: (rows, columns) followed by per-column summary statistics.
print(data.shape, data.describe(), sep="\n")

In [None]:
# Show the first two rows of the raw frame.
print(data.iloc[:2])

In [None]:
# Aggregate the readings into 2-hour buckets. Dividing the bucket sum by 8
# presumably averages eight 15-minute readings per bucket - TODO confirm the
# raw sampling interval.
data_kw = data.resample('2H').sum() / 8
num_timeseries = data.shape[1]

# One entry per column, with the leading zeros of each column stripped
# (trim='f' trims only from the front).
timeseries = []
for i in range(num_timeseries):
    column = data_kw.iloc[:, i]
    timeseries.append(np.trim_zeros(column, trim='f'))

In [None]:
# Display the second series (position 1) after its leading zeros were trimmed.
timeseries[1]

In [None]:
# First timestamp of the first series, then last timestamp of the last series.
print(timeseries[0].index[0], timeseries[-1].index[-1], sep="\n")

In [None]:
# Sampling frequency of the resampled series, as a pandas offset alias (2 hours).
freq = '2H'

# Both horizons span 7 days at 12 two-hour samples per day.
prediction_length = 7 * 12
context_length = 7 * 12

# Boundaries of the training window. The `freq` argument of `pd.Timestamp` was
# deprecated in pandas 1.4 and removed in pandas 2.0 (it now raises TypeError),
# so the timestamps are built without it; `freq` above carries the frequency.
start_dataset = pd.Timestamp("2014-01-01 00:00:00")
end_training = pd.Timestamp("2014-09-01 00:00:00")

In [None]:
# Abandoned attempt at a label-based lookup, kept for reference:
# timeseries.index.get_loc(timeseries.loc[:,:,'2014-09-01'].index[0])
# It cannot work because `timeseries` is a plain Python list (built with
# list.append), which has no `.index`/`.loc` accessors; the type check below
# confirms this.
type(timeseries)

In [None]:
# Locate the training window inside the FIRST series by timestamp.
# `get_loc` returns the integer position of the label and raises KeyError if
# the timestamp is absent, instead of silently leaving a wrong index behind.
# NOTE: these positions are relative to timeseries[0] only.
reference_index = timeseries[0].index

# Position of the first training row (the row stamped `start_dataset`).
start_training_idx = reference_index.get_loc(start_dataset)
print(start_training_idx, reference_index[start_training_idx])

# Position of the row stamped `end_training`, i.e. the first row of the test set.
end_training_idx = reference_index.get_loc(end_training)
print(end_training_idx, reference_index[end_training_idx])

# Step back one row so the index points at the last training row rather than
# the first test row. Positional slices ts[a:b] exclude b, so use
# `end_training_idx + 1` as a slice stop if this row must be included.
end_training_idx -= 1
print(reference_index[end_training_idx])

In [None]:
# Build one training record per series, selecting rows by timestamp.
# Each series had a different number of leading zeros trimmed, so a position
# computed on one series points at a different timestamp in another; filtering
# on the index keeps every target aligned with its "start" label.
# The window is start_dataset <= t < end_training (end_training itself is the
# first test row and is excluded).
# NOTE(review): "start" assumes every series already has data at
# start_dataset; a series that begins later would need its own start - confirm.
training_data = [
    {
        "start": str(start_dataset),
        "target": ts.loc[
            (ts.index >= start_dataset) & (ts.index < end_training)
        ].tolist(),
    }
    for ts in timeseries
]
print(len(training_data))

In [None]:
num_test_windows = 4

# Duration of one sample, derived from the frequency alias.
sample_step = pd.Timedelta(freq)

# Build rolling test records: for k = 1..num_test_windows, each series
# contributes the training window extended by k * prediction_length samples.
# Rows are selected by timestamp rather than by position, because each series
# had a different number of leading zeros trimmed and positions are therefore
# not comparable across series.
# Ordering is all series for k=1, then all series for k=2, and so on.
# NOTE(review): "start" assumes every series already has data at
# start_dataset; a series that begins later would need its own start - confirm.
test_data = [
    {
        "start": str(start_dataset),
        "target": ts.loc[
            (ts.index >= start_dataset)
            & (ts.index < end_training + k * prediction_length * sample_step)
        ].tolist(),
    }
    for k in range(1, num_test_windows + 1)
    for ts in timeseries
]
print(len(test_data))

In [None]:
def write_dicts_to_file(path, data):
    """Write *data* to *path* with one JSON document per line.

    Args:
        path: Destination file; created or truncated.
        data: Iterable of JSON-serializable objects.

    The file is opened in binary mode and each line is UTF-8 encoded with a
    "\\n" terminator, so the output bytes do not depend on the platform's
    newline convention.
    """
    with open(path, 'wb') as out:
        out.writelines(
            (json.dumps(record) + "\n").encode("utf-8") for record in data
        )

In [None]:
%%time
# Write both datasets to the working directory, one JSON document per line.
# The %%time cell magic on the first line reports how long the cell takes.
write_dicts_to_file("train.json", training_data)
write_dicts_to_file("test.json", test_data)

In [None]:
# Shell escape: list the working directory with human-readable file sizes,
# e.g. to check the size of the downloaded and generated files.
!ls -lh