# Traffic Dataset

There are two files for each fold, the data file and the labels file. We have split the 440 time series between train and test folds, but you are of course free to merge them to consider a different cross validation setting.
- The PEMS_train text file has 267 lines (the upstream description says 263, which is inconsistent with 440 series in total and 173 test instances). Each line describes one time series, provided as a matrix. The matrix syntax is that of Matlab, e.g. [ a b ; c d] is the matrix with row vectors [a b] and [c d], in that order. Each matrix contains the occupancy rates (963 rows, one for each station/detector) sampled every 10 minutes during the day (144 columns).
- The PEMS_trainlabels text file gives, for each day of measurements described above, the day of the week on which the data was sampled, as an integer between 1 (Mon.) and 7 (Sun.).

- PEMS_test and PEMS_testlabels are formatted in the same way, except that there are 173 test instances.

- The permutation that I used to shuffle the dataset is given in the randperm file. If you need to rearrange the data so that it follows the calendar order, you should merge train and test samples and reorder them using the inverse permutation of randperm.

In [None]:
# Notebook setup: output display, figure format, module autoreload, logging.
# Show the value of the last expression *or* assignment of every cell.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Reload imported modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

import logging

# Emit log records of level INFO and above.
logging.basicConfig(level=logging.INFO)

In [None]:
from io import StringIO
from zipfile import ZipFile

import pandas as pd
from pandas import DataFrame, Series

from tsdm.datasets import Traffic

In [None]:
# Create the dataset object with initialize=False.
# NOTE(review): the instance is bound to the name `self`, presumably so that
# snippets from this notebook can be pasted into methods of `Traffic` — confirm.
self = Traffic(initialize=False)

In [None]:
# Display the `timeseries` attribute of the dataset object.
self.timeseries

In [None]:
from abc import ABC, abstractmethod


class Foo(ABC):
    """Abstract base class that declares ``foo`` as an abstract property."""

    @property
    @abstractmethod
    def foo(self) -> int:
        """Return an integer; this base implementation returns 0."""
        return 0


class Bar(Foo):
    """Subclass of ``Foo`` that rebinds ``foo`` to a plain class attribute."""

    # NOTE(review): the original remark was "mypy ignores this complain!" —
    # presumably meaning mypy does not flag replacing the abstract property
    # with a plain attribute; confirm against the mypy output.
    foo = 1  # mypy ignores this complain!


# Instantiate the subclass; `foo` is a plain class attribute on Bar, so the
# abstract property of Foo no longer blocks instantiation.
obj = Bar()
# Assign an instance attribute that shadows the class attribute `Bar.foo`.
obj.foo = 2

In [None]:
# Display the `weekdays` attribute of a dataset object created with initialize=False.
Traffic(initialize=False).weekdays

In [None]:
# Call the `dates` descriptor of the class by hand, passing the existing
# instance `self`, instead of using normal attribute access.
Traffic.dates.__get__(self)

In [None]:
# Uncorrected ("false") calendar: daily range, days to exclude, label names.
false_dates = pd.date_range(
    start="2008-01-01", end="2009-03-30", freq="d", name="day"
)
# Only the dictionary keys (the dates) end up in the index; the values
# document why each day is excluded.
false_anomalies = pd.DatetimeIndex({
    "Jan. 1, 2008": "New Year’s Day",
    "Jan. 21, 2008": "Martin Luther King Jr. Day",
    "Feb. 18, 2008": "Washington’s Birthday",
    "Mar. 9, 2008": "Anomaly day",
    "May 26, 2008": "Memorial Day",
    "Jul. 4, 2008": "Independence Day",
    "Sep. 1, 2008": "Labor Day",
    "Oct. 13, 2008": "Columbus Day",
    "Nov. 11, 2008": "Veterans Day",
    "Nov. 27, 2008": "Thanksgiving",
    "Dec. 25, 2008": "Christmas Day",
    "Jan. 1, 2009": "New Year’s Day",
    "Jan. 19, 2009": "Martin Luther King Jr. Day",
    "Feb. 16, 2009": "Washington’s Birthday",
    "Mar. 8, 2009": "Anomaly day",
})
# String keys "1".."7" mapped to day names, starting with Sunday.
false_weekdays = {
    str(number): day_name
    for number, day_name in enumerate(
        [
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        ],
        start=1,
    )
}

# Select which calendar variant to use. Only the selected branch is
# evaluated, so the `true_*` names need not exist while the flag is False.
use_corrected_dates = False
if use_corrected_dates:
    dates, anomalies, weekdays = true_dates, true_anomalies, true_weekdays
else:
    dates, anomalies, weekdays = false_dates, false_anomalies, false_weekdays

# Drop the excluded days; every one of them must lie inside the date range.
mask = dates.isin(anomalies)
assert mask.sum() == len(anomalies)
dates = dates[~mask]

In [None]:
# Display the intra-day time axis: offsets from 0:00:00 in 10-minute steps.
pd.timedelta_range("0:00:00", "23:59:59", freq="10min", name="time")

In [None]:
# Toggle between the corrected ("true") and uncorrected ("false") calendar.
use_corrected_dates = True

# --- corrected calendar -------------------------------------------------
true_dates = pd.date_range(
    start="2008-01-01", end="2009-03-26", freq="d", name="day"
)
# Only the dictionary keys (the dates) end up in the index; the values are
# annotations ("???" marks excluded days without a recorded reason).
true_anomalies = pd.DatetimeIndex({
    "2008-01-01": "New Year’s Day",
    "2008-01-21": "Martin Luther King Jr. Day",
    "2008-02-18": "Washington’s Birthday",
    "2008-03-09": "anomaly",
    "2008-05-26": "Memorial Day",
    "2008-07-04": "Independence Day",
    "2008-09-01": "Labor Day",
    "2008-10-20": "???",
    "2008-11-17": "???",
    "2008-12-07": "???",
    "2009-02-23": "???",
    # "2009-03-08": "anomaly",
})
# Integer labels 1..7 mapped to day names, starting with Sunday.
true_weekdays = dict(
    enumerate(
        [
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        ],
        start=1,
    )
)

# --- uncorrected calendar -----------------------------------------------
false_dates = pd.date_range(
    start="2008-01-01", end="2009-03-30", freq="d", name="day"
)
false_anomalies = pd.DatetimeIndex({
    "Jan. 1, 2008": "New Year’s Day",
    "Jan. 21, 2008": "Martin Luther King Jr. Day",
    "Feb. 18, 2008": "Washington’s Birthday",
    "Mar. 9, 2008": "Anomaly day",
    "May 26, 2008": "Memorial Day",
    "Jul. 4, 2008": "Independence Day",
    "Sep. 1, 2008": "Labor Day",
    "Oct. 13, 2008": "Columbus Day",
    "Nov. 11, 2008": "Veterans Day",
    "Nov. 27, 2008": "Thanksgiving",
    "Dec. 25, 2008": "Christmas Day",
    "Jan. 1, 2009": "New Year’s Day",
    "Jan. 19, 2009": "Martin Luther King Jr. Day",
    "Feb. 16, 2009": "Washington’s Birthday",
    "Mar. 8, 2009": "Anomaly day",
})
# Integer labels 1..7 mapped to day names, starting with Monday.
false_weekdays = dict(
    enumerate(
        [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ],
        start=1,
    )
)

# Pick the calendar variant selected by the flag.
if use_corrected_dates:
    dates, anomalies, weekdays = true_dates, true_anomalies, true_weekdays
else:
    dates, anomalies, weekdays = false_dates, false_anomalies, false_weekdays

# remove anomalies
dates = dates[~dates.isin(anomalies)]

# Shuffle dates according to permutation the authors applied
shuffled_dates = dates[self.randperm]

# Intra-day time axis: one offset per 10-minute sample, 144 per day.
timestamps = pd.timedelta_range("0:00:00", "23:59:59", freq="10min", name="time")
assert len(timestamps) == 144

with ZipFile(self.rawdata_paths["PEMS-SF.zip"]) as archive:
    # Station identifiers: strip the brackets and put one entry per line so
    # that the text can be parsed as a single-column CSV.
    with archive.open("stations_list") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        stations = pd.read_csv(
            StringIO(content), names=["station"], dtype="category"
        ).squeeze()

    # Training labels; the first len(trainlabels) shuffled dates are assigned
    # to the training samples.
    with archive.open("PEMS_trainlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "\n", " ": "\n"})
        trainlabels = pd.read_csv(
            StringIO(content), names=["label"], dtype="uint8"
        ).squeeze()
        train_dates = shuffled_dates[: len(trainlabels)]
        trainlabels.index = train_dates
    # Check that the labels match with the actual weekdays
    assert all(
        trainlabels.index.day_name() == trainlabels.map(weekdays)
    ), "Labels do not match with dates!"

    # Test labels; the remaining shuffled dates are assigned to the test samples.
    with archive.open("PEMS_testlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        testlabels = pd.read_csv(
            StringIO(content), names=["label"], dtype="uint8"
        ).squeeze()
        test_dates = shuffled_dates[len(trainlabels) :]
        testlabels.index = test_dates

    # Check that the labels match with the actual weekdays
    assert all(
        testlabels.index.day_name() == testlabels.map(weekdays)
    ), "Labels do not match with dates!"
    assert (
        trainlabels.dtype == testlabels.dtype
    ), "Train and test have different labels!"

    # Each line of the data file is one matrix in Matlab syntax. Rewrite it
    # as CSV text (";" separates rows, " " separates values), label rows with
    # stations and columns with timestamps, and store the transpose so that
    # rows are timestamps and columns are stations.
    with archive.open("PEMS_train") as file:
        _PEMS_train = []
        for line in file:
            content = line.decode("utf8")
            content = _reformat(content, {"[": "", "]": "", ";": "\n", " ": ","})
            df = pd.read_csv(StringIO(content), header=None).squeeze()
            df = DataFrame(df.values, index=stations, columns=timestamps)
            _PEMS_train.append(df.T)
        # The dates become the outer index level, the timestamps the inner one.
        PEMS_train = pd.concat(_PEMS_train, keys=train_dates)

    # Same procedure for the test split.
    with archive.open("PEMS_test") as file:
        _PEMS_test = []
        for line in file:
            content = line.decode("utf8")
            content = _reformat(content, {"[": "", "]": "", ";": "\n", " ": ","})
            df = pd.read_csv(StringIO(content), header=None).squeeze()
            df = DataFrame(df.values, index=stations, columns=timestamps)
            _PEMS_test.append(df.T)
        PEMS_test = pd.concat(_PEMS_test, keys=test_dates)

# Turn the index levels into ordinary columns so they can be combined below.
PEMS = pd.concat([PEMS_train, PEMS_test]).reset_index()
labels = pd.concat([trainlabels, testlabels]).rename("labels")

# Undo the shuffle and compare every label with the weekday of its date.
# `labels` is indexed by dates, so the integer permutation must be applied
# positionally via `.iloc`; plain `labels[...]` with integer keys relies on
# the deprecated positional fallback of `Series.__getitem__`.
mismatches = labels.iloc[self.invperm].map(weekdays) != dates.day_name()
assert len(dates[mismatches]) == 0, "Mismatches in label and date weekday!"

# Merge day and intra-day offset into a single timestamp index and store the
# measurements as float32.
PEMS = (
    PEMS.assign(time=PEMS["day"] + PEMS["time"])
    .drop(columns="day")
    .set_index("time")
    .astype("float32")
)

In [None]:
# Display the assembled measurement frame.
PEMS

In [None]:
# Display the training labels.
trainlabels

In [None]:
# Show the entries of `dates` that fall in March 2009.
dates.to_series().loc["2009-03"]

In [None]:
def _reformat(s: str, replacements: dict) -> str:
    r"""Replace multiple substrings via dict.

    The replacements are applied one after another, in the iteration order of
    ``replacements``; a later replacement therefore also acts on text that was
    produced by an earlier one.

    https://stackoverflow.com/a/64500851/9318372

    Parameters
    ----------
    s
        The input string.
    replacements
        Mapping from substring to its replacement.

    Returns
    -------
    str
        The string after all replacements; ``s`` itself if ``replacements``
        is empty.
    """
    # A plain loop also handles an empty mapping, for which unpacking the
    # last element of a generator would raise ValueError.
    for old, new in replacements.items():
        s = s.replace(old, new)
    return s

In [None]:
# Variant of the PEMS_train parser that passes the timestamps directly to
# `read_csv` as column names instead of rebuilding the frame afterwards.
timestamps = pd.timedelta_range("0:00:00", "23:59:59", freq="10min", name="time")

with ZipFile(self.rawdata_paths["PEMS-SF.zip"]) as archive:
    with archive.open("PEMS_train") as file:
        _PEMS_train = []
        for line in file:
            content = line.decode("utf8")
            # Rewrite the Matlab matrix as CSV: ";" ends a row, " " separates values.
            content = _reformat(content, {"[": "", "]": "", ";": "\n", " ": ","})
            df = pd.read_csv(StringIO(content), names=timestamps).squeeze()
            # df = DataFrame(df.values, columns=timestamps)
            # Store the transpose, so that the timestamps label the rows.
            _PEMS_train.append(df.T)
        # No `keys` are passed here, so the frames are stacked without a date level.
        PEMS_train = pd.concat(_PEMS_train)

In [None]:
# Reorder the dates with the permutation stored on the dataset object.
shuffled_dates = dates[self.randperm]


# Variant that reads the labels with dtype="category" and does not attach
# dates to them (the corresponding code is commented out below).
with ZipFile(self.rawdata_paths["PEMS-SF.zip"]) as archive:
    with archive.open("stations_list") as file:
        content = file.read().decode("utf8")
        # Strip the brackets and put one station per line.
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        stations = pd.read_csv(
            StringIO(content), names=["station"], dtype="category"
        ).squeeze()
        stations = Series(stations)  # make sure it's not TextFileReader

    with archive.open("PEMS_trainlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "\n", " ": "\n"})
        trainlabels = pd.read_csv(
            StringIO(content), names=["labels"], dtype="category"
        ).squeeze()

    with archive.open("PEMS_testlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        testlabels = pd.read_csv(
            StringIO(content), names=["labels"], dtype="category"
        ).squeeze()
        # test_dates = shuffled_dates[len(trainlabels) :]
        # testlabels.index = test_dates
        # testlabels = Series(testlabels)  # make sure it's not TextFileReader
    #     train_dates = shuffled_dates[: len(trainlabels)]
    #     trainlabels.index = train_dates
    #     trainlabels = Series(trainlabels)  # make sure it's not TextFileReader
    # # Check that the labels match with the actual weekdays
    # assert all(
    #     trainlabels.index.day_name() == trainlabels.values.map(weekdays)
    # ), "Labels do not match with dates!"

In [None]:
# Reorder the labels positionally with the inverse permutation.
labels.iloc[self.invperm]

In [None]:
# Display the current `dates` index.
dates

In [None]:
# Element-wise comparison of the permutation with (permutation + 1) modulo its length.
self.randperm == ((self.randperm + 1).apply(lambda x: x % len(self.randperm)))

In [None]:
# Display the permutation itself.
self.randperm

In [None]:
# Display the permutation shifted by one, modulo its length.
((self.randperm + 1).apply(lambda x: x % len(self.randperm)))

In [None]:
# Stack train and test labels into one series (train first).
labels = pd.concat([trainlabels, testlabels])

In [None]:
# Display the dates assigned to the training samples.
train_dates

In [None]:
# Display the training labels.
trainlabels

In [None]:
# Reorder the dates with the permutation stored on the dataset object.
shuffled_dates = dates[self.randperm]


# Variant of the full loading procedure that reads the labels with
# dtype="category" and wraps the parsed results in `Series`.
with ZipFile(self.rawdata_paths["PEMS-SF.zip"]) as archive:
    with archive.open("stations_list") as file:
        content = file.read().decode("utf8")
        # Strip the brackets and put one station per line.
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        stations = pd.read_csv(
            StringIO(content), names=["station"], dtype="category"
        ).squeeze()
        stations = Series(stations)  # make sure it's not TextFileReader

    with archive.open("PEMS_trainlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "\n", " ": "\n"})
        trainlabels = pd.read_csv(
            StringIO(content), names=["labels"], dtype="category"
        ).squeeze()
        # The first len(trainlabels) shuffled dates belong to the training samples.
        train_dates = shuffled_dates[: len(trainlabels)]
        trainlabels.index = train_dates
        trainlabels = Series(trainlabels)  # make sure it's not TextFileReader
    # Check that the labels match with the actual weekdays
    # NOTE(review): the labels are categorical here; whether their categories
    # match the key type of `weekdays` is not visible in this cell — confirm.
    assert all(
        trainlabels.index.day_name() == trainlabels.values.map(weekdays)
    ), "Labels do not match with dates!"

    with archive.open("PEMS_testlabels") as file:
        content = file.read().decode("utf8")
        content = _reformat(content, {"[": "", "]": "", " ": "\n"})
        testlabels = pd.read_csv(
            StringIO(content), names=["labels"], dtype="category"
        ).squeeze()
        # The remaining shuffled dates belong to the test samples.
        test_dates = shuffled_dates[len(trainlabels) :]
        testlabels.index = test_dates
        testlabels = Series(testlabels)  # make sure it's not TextFileReader

    # Check that the labels match with the actual weekdays
    assert all(
        testlabels.index.day_name() == testlabels.values.map(weekdays)
    ), "Labels do not match with dates!"
    assert (
        trainlabels.dtype == testlabels.dtype
    ), "Train and test have different labels!"

    # Each line is one matrix in Matlab syntax; rewrite it as CSV text, label
    # rows with stations and columns with timestamps, and store the transpose.
    with archive.open("PEMS_train") as file:
        _PEMS_train = []
        for line in file:
            content = line.decode("utf8")
            content = _reformat(content, {"[": "", "]": "", ";": "\n", " ": ","})
            df = pd.read_csv(StringIO(content), header=None).squeeze()
            df = DataFrame(df.values, index=stations, columns=timestamps)
            _PEMS_train.append(df.T)
        # The dates become the outer index level.
        PEMS_train = pd.concat(_PEMS_train, keys=train_dates)

    with archive.open("PEMS_test") as file:
        _PEMS_test = []
        for line in file:
            content = line.decode("utf8")
            content = _reformat(content, {"[": "", "]": "", ";": "\n", " ": ","})
            df = pd.read_csv(StringIO(content), header=None).squeeze()
            df = DataFrame(df.values, index=stations, columns=timestamps)
            _PEMS_test.append(df.T)
        PEMS_test = pd.concat(_PEMS_test, keys=test_dates)

# Stack both splits (train first); the index is left as built by `concat`.
PEMS = pd.concat([PEMS_train, PEMS_test])

In [None]:
# Display the concatenated training frames.
PEMS_train

In [None]:
# Display `df`, the frame left over from the last iteration of a parsing loop.
df