In [None]:
# Display the value of a cell's last expression, or of its last assignment.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Automatically reload imported modules before executing code (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook.
%matplotlib inline

# Traffic Dataset

There are two files for each fold, the data file and the labels file. We have split the 440 time series between train and test folds, but you are of course free to merge them to consider a different cross validation setting.
- The PEMS_train text file has 267 lines (the upstream description says 263, but 440 − 173 = 267). Each line describes a time series given as a matrix. The matrix syntax is that of Matlab, e.g. [ a b ; c d] is the matrix with row vectors [a b] and [c d] in that order. Each matrix holds the occupancy rates (963 rows, one for each station/detector) sampled every 10 minutes during the day (144 columns).
- The PEMS_trainlabels text file gives, for each day of measurements described above, the day of the week on which the data was sampled, namely an integer between 1 (Mon.) and 7 (Sun.). Note: this notebook decodes the labels as 1 = Sunday, …, 7 = Saturday (see `true_weekdays`).

- PEMS_test and PEMS_testlabels are formatted in the same way, except that there are 173 test instances.

- The permutation that I used to shuffle the dataset is given in the randperm file. If you need to rearrange the data so that it follows the calendar order, you should merge train and test samples and reorder them using the inverse permutation of randperm.

In [None]:
from tsdm.datasets import BaseDataset

In [None]:
class Traffic(BaseDataset):
    """Dataset definition for the PEMS-SF traffic data from the UCI repository.

    Only the two URLs are declared here; everything else is inherited from
    ``BaseDataset``.
    """

    # Base URL of the raw files — presumably what `download()` fetches from; confirm in BaseDataset.
    url: str = r"https://archive.ics.uci.edu/ml/machine-learning-databases/00204/"
    # Page describing the dataset.
    info_url: str = r"https://archive.ics.uci.edu/ml/datasets/PEMS-SF"


# Fetch the raw data; the cells that follow read "PEMS-SF.zip" from `Traffic.rawdata_path`.
Traffic.download()

In [None]:
from io import StringIO
from zipfile import ZipFile

import numpy as np
import pandas
from pandas import DataFrame

In [None]:
# Switch between the empirically reconstructed calendar (True) and the
# alternative calendar built from the public-holiday list (False).
use_true = True

# --- reconstructed calendar ---------------------------------------------------
# The missing days were found one at a time: after adding a candidate, the
# first date at which labels[invperm].map(weekdays) disagreed with
# dates.day_name() pointed at the next day that had to be removed.
true_dates = pandas.date_range(
    start="2008-01-01", end="2009-03-26", freq="d", name="day"
)
true_anomalies = pandas.DatetimeIndex(
    {
        "2008-01-01": "New Year’s Day",
        "2008-01-21": "Martin Luther King Jr. Day",
        "2008-02-18": "Washington’s Birthday",
        "2008-03-09": "anomaly + wrong year",
        "2008-05-26": "Memorial Day",
        "2008-07-04": "Independence Day",
        "2008-09-01": "Labor Day",
        "2008-10-20": "???",
        "2008-11-17": "???",
        "2008-12-07": "???",
        "2009-02-23": "???",
    }
)
# Label "1" denotes Sunday in this variant.
true_weekdays = dict(
    zip(
        "1234567",
        (
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        ),
    )
)

# --- holiday-list calendar ----------------------------------------------------
false_dates = pandas.date_range(
    start="2008-01-01", end="2009-03-30", freq="d", name="day"
)
false_anomalies = pandas.DatetimeIndex(
    {
        "Jan. 1, 2008": "New Year’s Day",
        "Jan. 21, 2008": "Martin Luther King Jr. Day",
        "Feb. 18, 2008": "Washington’s Birthday",
        "Mar. 9, 2008": "Anomaly day",
        "May 26, 2008": "Memorial Day",
        "Jul. 4, 2008": "Independence Day",
        "Sep. 1, 2008": "Labor Day",
        "Oct. 13, 2008": "Columbus Day",
        "Nov. 11, 2008": "Veterans Day",
        "Nov. 27, 2008": "Thanksgiving",
        "Dec. 25, 2008": "Christmas Day",
        "Jan. 1, 2009": "New Year’s Day",
        "Jan. 19, 2009": "Martin Luther King Jr. Day",
        "Feb. 16, 2009": "Washington’s Birthday",
        "Mar. 8, 2009": "Anomaly day",
    }
)
# Label "1" denotes Monday in this variant.
false_weekdays = dict(
    zip(
        "1234567",
        (
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ),
    )
)

# Pick one consistent (calendar, anomalies, label encoding) triple.
dates, anomalies, weekdays = (
    (true_dates, true_anomalies, true_weekdays)
    if use_true
    else (false_dates, false_anomalies, false_weekdays)
)

# Remove the anomalous days from the calendar; every anomaly must lie inside it.
mask = dates.isin(anomalies)
assert mask.sum() == len(anomalies)
dates = dates[~mask]

In [None]:
# Intra-day sampling grid: one time offset every 10 minutes, starting at midnight.
timestamps = pandas.timedelta_range(
    start="0:00:00", end="23:59:59", freq="10min", name="time"
)
assert timestamps.size == 144
timestamps

In [None]:
def _reformat(s: str, replacements: dict) -> str:
    """Replaces substrings with replacments from dict.

    https://stackoverflow.com/a/64500851/9318372
    """
    *_, s = (s := s.replace(c, r) for c, r in replacements.items())
    return s

In [None]:
rawdata_file = Traffic.rawdata_path.joinpath("PEMS-SF.zip")

# Replacement tables that turn the Matlab-style literals of the raw files into CSV text.
_VECTOR_SYNTAX = {"[": "", "]": "", " ": "\n"}  # "[a b c]" -> one value per line
_MATRIX_SYNTAX = {"[": "", "]": "", ";": "\n", " ": ","}  # "[a b;c d]" -> one CSV row per matrix row


def _read_vector(archive: ZipFile, member: str, name: str, dtype: str) -> pandas.Series:
    """Read a Matlab-style row vector from ``archive`` as a Series.

    Parameters
    ----------
    archive: ZipFile
        The opened raw-data archive.
    member: str
        Name of the file inside the archive.
    name: str
        Name given to the resulting Series.
    dtype: str
        dtype passed on to ``pandas.read_csv``.

    Returns
    -------
    pandas.Series
        One entry per vector element, in file order.
    """
    with archive.open(member) as file:
        content = _reformat(file.read().decode("utf8"), _VECTOR_SYNTAX)
    frame = pandas.read_csv(StringIO(content), names=[name], dtype=dtype)
    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the single column explicitly works on every version.
    return frame.squeeze("columns")


def _read_matrices(
    archive: ZipFile,
    member: str,
    days: pandas.DatetimeIndex,
    stations: pandas.Series,
    timestamps: pandas.TimedeltaIndex,
) -> DataFrame:
    """Read a file holding one Matlab-style matrix per line.

    Each matrix has one row per station and one column per time of day. It is
    transposed so that the rows are times and the columns are stations, and the
    per-day frames are stacked under the corresponding entry of ``days``.

    Parameters
    ----------
    archive: ZipFile
        The opened raw-data archive.
    member: str
        Name of the file inside the archive.
    days: pandas.DatetimeIndex
        One date per line of the file, used as outer index level.
    stations: pandas.Series
        Station identifiers, used as column labels.
    timestamps: pandas.TimedeltaIndex
        Times of day, used as inner index level.

    Returns
    -------
    DataFrame
        Indexed by (day, time), with one column per station.
    """
    frames = []
    with archive.open(member) as file:
        for raw_line in file:
            text = _reformat(raw_line.decode("utf8"), _MATRIX_SYNTAX)
            values = pandas.read_csv(StringIO(text), header=None).values
            frames.append(DataFrame(values, index=stations, columns=timestamps).T)
    return pandas.concat(frames, keys=days)


with ZipFile(rawdata_file) as files:
    stations = _read_vector(files, "stations_list", name="station", dtype="category")

    randperm = _read_vector(files, "randperm", name="randperm", dtype="uint16")
    randperm = randperm - 1  # we use 0-based indexing
    invperm = randperm.copy().argsort()
    invperm.name = "invperm"
    assert (randperm[invperm] == np.arange(len(randperm))).all()
    assert len(randperm) == len(dates), "Calendar and permutation differ in length!"

    # Shuffle dates according to permutation the authors applied
    shuffled_dates = dates[randperm]

    # The first len(train) shuffled dates belong to the train fold, the rest to test.
    PEMS_trainlabels = _read_vector(
        files, "PEMS_trainlabels", name="labels", dtype="category"
    )
    PEMS_testlabels = _read_vector(
        files, "PEMS_testlabels", name="labels", dtype="category"
    )
    train_dates = shuffled_dates[: len(PEMS_trainlabels)]
    test_dates = shuffled_dates[len(PEMS_trainlabels) :]
    PEMS_trainlabels.index = train_dates
    PEMS_testlabels.index = test_dates

    # Check that the labels match with the actual weekdays
    for _fold_labels in (PEMS_trainlabels, PEMS_testlabels):
        assert (
            _fold_labels.index.day_name() == _fold_labels.values.map(weekdays)
        ).all(), "Labels do not match with dates!"
    assert (
        PEMS_trainlabels.dtype == PEMS_testlabels.dtype
    ), "Train and test have different labels!"
    PEMS_labels = pandas.concat([PEMS_trainlabels, PEMS_testlabels]).rename("labels")

    PEMS_train = _read_matrices(files, "PEMS_train", train_dates, stations, timestamps)
    PEMS_test = _read_matrices(files, "PEMS_test", test_dates, stations, timestamps)

# Undo the shuffle (positionally) and compare against the calendar once more.
mismatches = PEMS_labels.iloc[invperm.to_numpy()].map(weekdays) != dates.day_name()
assert len(dates[mismatches]) == 0, "Mismatches in label and date weekday!"
PEMS = pandas.concat([PEMS_train, PEMS_test])


There are two files for each fold, the data file and the labels file. We have split the 440 time series between train and test folds, but you are of course free to merge them to consider a different cross validation setting.
- The PEMS_train text file has 267 lines (the upstream description says 263, but 440 − 173 = 267). Each line describes a time series given as a matrix. The matrix syntax is that of Matlab, e.g. [ a b ; c d] is the matrix with row vectors [a b] and [c d] in that order. Each matrix holds the occupancy rates (963 rows, one for each station/detector) sampled every 10 minutes during the day (144 columns).
- The PEMS_trainlabels text file gives, for each day of measurements described above, the day of the week on which the data was sampled, namely an integer between 1 (Mon.) and 7 (Sun.). Note: this notebook decodes the labels as 1 = Sunday, …, 7 = Saturday (see `true_weekdays`).

- PEMS_test and PEMS_testlabels are formatted in the same way, except that there are 173 test instances.

- The permutation that I used to shuffle the dataset is given in the randperm file. If you need to rearrange the data so that it follows the calendar order, you should merge train and test samples and reorder them using the inverse permutation of randperm.

In [None]:
# Display the test-fold labels (indexed by their shuffled dates).
pandas.Series(PEMS_testlabels)

In [None]:
# Round-trip through a DataFrame: move the index into a "day" column, set it
# as the index again, and squeeze the single remaining column back to a Series.
PEMS_labels.reset_index().set_index("day").squeeze()

## Example Plot

In [None]:
import matplotlib.pyplot as plt

# Measurements of a single station as one continuous series in calendar order:
# the (day, time) index levels are combined into a single timestamp index.
station = PEMS.loc[dates]["400000"].reset_index()
station.index = station.day + station.time
station = station.drop(columns=["day", "time"])

fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(16, 9), constrained_layout=True)
fig.suptitle("Station 400000: ±2 days around each anomaly")
panels = axes.flatten()

# visualize around anomalies (the anomalous days themselves are not in `dates`)
for anomaly, ax in zip(anomalies, panels):
    anomaly = pandas.Timestamp(anomaly)
    start = anomaly - pandas.Timedelta("2d")
    stop = anomaly + pandas.Timedelta("2d")
    ts = station.loc[start:stop]
    ax.plot(ts.index.to_numpy(), ts.values)
    # Label every panel so it can be read without consulting the code.
    ax.set_title(anomaly.strftime("%Y-%m-%d"))
    ax.set_ylabel("occupancy rate")
    ax.tick_params(axis="x", labelrotation=30)

# The grid has 15 panels but there may be fewer anomalies: hide the unused ones
# instead of drawing empty frames.
for ax in panels[len(anomalies) :]:
    ax.set_axis_off()