In [1]:
import tsdm
import numpy as np
from tsdm.datasets import Electricity, DatasetMetaClass

In [2]:
# Show the attribute dictionary (``__dict__``) of the ``Electricity`` class object.
vars(Electricity)

In [3]:
%%timeit -n 1 -r 1
# Time a single evaluation (1 loop, 1 repeat) of the ``dataset`` attribute.
Electricity.dataset

In [4]:
%%timeit -n 1 -r 1
# Same single-shot timing as the previous cell.
# NOTE(review): presumably repeated to compare against the first access (e.g. caching) — confirm.
Electricity.dataset

In [5]:
# Call ``clean`` on the class itself (no instance is created).
# NOTE(review): presumably (re)builds the cleaned dataset from the raw files — confirm against tsdm.
Electricity.clean()

In [4]:
# Instantiate ``Electricity`` and call its ``load`` method; the return value is the cell output.
Electricity().load()

In [15]:
# Shell form of the wget call used by the experiments below:
#   -r            recursive download
#   -np           never ascend to the parent directory
#   -nH           do not create a host-name directory
#   --cut-dirs 3  drop the first three remote path components
#   -P DIR        save everything below DIR
# NOTE(review): the target directory is an absolute, user-specific path.
command = "wget -r -np -nH --cut-dirs 3 -P '/home/rscholz/.tsdm/rawdata/Electricity' https://archive.ics.uci.edu/ml/machine-learning-databases/00321/"

In [16]:
# Argument-vector form of the wget call, for use *without* a shell.
# The prefix option and its value are separate items: when no shell strips the
# quotes, a single "-P='...'" item makes wget use the literal text "='...'"
# (equals sign and quotes included) as the download directory.
# NOTE(review): the target directory is an absolute, user-specific path.
args = [
    "wget",
    "-r",
    "-np",
    "-nH",
    "--cut-dirs=3",
    "-P",
    "/home/rscholz/.tsdm/rawdata/Electricity",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00321/",
]
args

In [6]:
import subprocess
import shlex

In [7]:
#https://stackoverflow.com/a/52576723/9318372

In [21]:
# -*- coding: utf-8 -*-
from subprocess import PIPE, Popen
import fcntl
import os
import select
import sys

# Launch without a shell: combining ``shell=True`` with an argument *list* makes
# POSIX execute only ``args[0]`` and hand the remaining items to ``/bin/sh``
# itself, so every wget option would be dropped.
proc = Popen(args, stdin=PIPE, stderr=PIPE, stdout=PIPE)

# Put stderr into non-blocking mode once, so ``read()`` returns whatever is
# currently available instead of waiting for EOF.
stderr_fd = proc.stderr.fileno()
fcntl.fcntl(
    stderr_fd,
    fcntl.F_SETFL,
    fcntl.fcntl(stderr_fd, fcntl.F_GETFL) | os.O_NONBLOCK,
)

# ``buf`` accumulates partial output across idle periods; it is only cleared
# after it has been printed.
buf = ''
while proc.poll() is None:
    # Wait up to 0.1 s for stderr to become readable, then re-check the process.
    readx_err = select.select([stderr_fd], [], [], 0.1)[0]
    if not readx_err:
        continue
    chunk = proc.stderr.read()
    if not chunk:  # ``None`` (nothing available yet) or ``b''`` (EOF)
        continue
    buf += chunk.decode('utf-8', errors='replace')
    # Only emit once the buffer looks like a complete wget progress line.
    if '\n' in buf and '%' in buf and '.' in buf:
        print(buf.strip().split())
        buf = ''

In [24]:
def run(args):
    """Run ``args`` without a shell and echo the child's stderr line by line.

    Parameters
    ----------
    args
        Program arguments, forwarded unchanged to ``subprocess.Popen``.

    Notes
    -----
    The child's stdout is sent to ``DEVNULL``: it was never read, and an
    unread pipe blocks the child as soon as the pipe buffer is full.
    Each stderr line already ends with a newline, so it is printed with
    ``end=''`` to avoid blank lines between entries.
    """
    with subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) as process:
        for line in process.stderr:
            print(line.decode('utf8', errors='replace'), end='')

In [25]:
# Call ``run`` with the wget argument list ``args`` built in an earlier cell.
run(args)

In [15]:
# Run ``command`` through the shell with an unbuffered stdout pipe and echo it
# one byte at a time, flushing after every byte; stop at EOF (empty read).
with subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=0, shell=True) as p:
    while True:
        char = p.stdout.read(1)
        if char == b'':
            break
        print(char.decode('UTF-8'), end='', flush=True)

In [7]:
from subprocess import Popen, PIPE, STDOUT

def runrealcmd(command):
    """Run ``command`` through the shell and print its output line by line.

    stderr is merged into stdout, so both streams appear in the order in
    which they were written. Trailing whitespace of every line is stripped
    before printing.

    Parameters
    ----------
    command : str
        Shell command line (executed with ``shell=True``).

    Notes
    -----
    The process is managed by a ``with`` block so that the pipe is closed and
    the child is waited for even if decoding or printing raises.
    """
    with Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            print(line.rstrip().decode('utf8', errors='replace'))


In [12]:
# Stream the combined stdout/stderr of ``command`` as decoded text lines.
try:
    # ``bufsize=1`` selects line buffering in text mode. The ``with`` block
    # closes the pipe and waits for the child once the output is exhausted.
    with Popen(command, stdout=PIPE, stderr=STDOUT, encoding='utf8', shell=True, bufsize=1, errors='replace') as process:
        # Iterating the pipe blocks until the next line and ends at EOF, so no
        # additional ``poll()`` loop is required.
        for lline in process.stdout:
            # Lines keep their own newline; do not add a second one.
            print(lline, end='')
except Exception as exception:
    print(exception)

In [8]:
# Call ``runrealcmd`` with the shell-form wget ``command`` string defined in an earlier cell.
runrealcmd(command)

In [None]:
# invoke process (no shell; ``command`` is tokenised with shlex)
process = subprocess.Popen(shlex.split(command), shell=False, stdout=subprocess.PIPE)

# Show stdout live. Reading until EOF, rather than stopping as soon as
# ``poll()`` reports an exit, keeps the final lines that are still buffered
# in the pipe when the process terminates.
with process:
    for output in process.stdout:
        # Decode so that text is printed instead of a bytes repr.
        print(output.strip().decode('utf8', errors='replace'))
# Leaving the ``with`` block has waited for the child, so the code is final.
rc = process.returncode

In [None]:
# Instantiate ``Electricity`` and call its ``download`` method.
# NOTE(review): presumably fetches the raw files that the wget experiments above target — confirm.
Electricity().download()

In [None]:
import signal
import subprocess as sp


class VerboseCalledProcessError(sp.CalledProcessError):
    """``CalledProcessError`` whose string form also shows the captured stdout and stderr."""

    def _headline(self):
        """Return the one-line summary: death by signal, or a plain exit status."""
        code = self.returncode
        if not (code and code < 0):
            return "Command '%s' returned non-zero exit status %d." % (self.cmd, code)
        # A negative return code is treated as "terminated by signal -code".
        try:
            return "Command '%s' died with %r." % (self.cmd, signal.Signals(-code))
        except ValueError:
            # ``signal.Signals`` rejects numbers it has no member for.
            return "Command '%s' died with unknown signal %d." % (self.cmd, -code)

    def __str__(self):
        """Summary line followed by labelled ``Stdout:`` and ``Stderr:`` sections."""
        sections = (
            self._headline(),
            "Stdout:",
            f"{self.output}",
            "Stderr:",
            f"{self.stderr}",
        )
        return "\n".join(sections)


def _pump_lines(stream, collected, echo, target):
    """Append every line of ``stream`` to ``collected``, optionally echoing it.

    ``target`` names the ``sys`` attribute (``"stdout"`` or ``"stderr"``) to echo
    to; it is looked up for every line so that redirections are honoured.
    """
    import sys

    for line in stream:
        if echo:
            print(line, end='', file=getattr(sys, target))
        collected.append(line)


def bash(cmd, print_stdout=True, print_stderr=True):
    """Run ``cmd`` with ``/bin/bash`` and stream its stdout and stderr live.

    Parameters
    ----------
    cmd : str
        Command line handed to bash (``shell=True``).
    print_stdout : bool
        Echo the child's stdout to ``sys.stdout`` while it is produced.
    print_stderr : bool
        Echo the child's stderr to ``sys.stderr`` while it is produced.

    Raises
    ------
    VerboseCalledProcessError
        If the command exits with a non-zero status; the exception carries the
        complete captured stdout and stderr text.

    Notes
    -----
    stderr is drained on a helper thread while stdout is read on the calling
    thread. Reading one pipe to EOF before touching the other blocks forever
    once the child has filled the unread pipe's buffer.
    """
    import threading

    all_stdout = []
    all_stderr = []
    with sp.Popen(cmd, stderr=sp.PIPE, stdout=sp.PIPE, shell=True, universal_newlines=True,
                  executable='/bin/bash') as proc:
        stderr_reader = threading.Thread(
            target=_pump_lines,
            args=(proc.stderr, all_stderr, print_stderr, 'stderr'),
            daemon=True,
        )
        stderr_reader.start()
        _pump_lines(proc.stdout, all_stdout, print_stdout, 'stdout')
        stderr_reader.join()

    # Leaving the ``with`` block has closed both pipes and waited for the child.
    if proc.returncode != 0:
        stdout_text = ''.join(all_stdout)
        stderr_text = ''.join(all_stderr)
        raise VerboseCalledProcessError(proc.returncode, cmd, stdout_text, stderr_text)

In [None]:
# Smoke test for ``bash``: with the default ``print_stdout=True`` the echoed text is printed.
bash("echo 'hello world'")

In [None]:
# Name of the class of the object returned by ``np.arange`` (``'ndarray'``).
np.arange(3).__class__.__name__

In [None]:
# Load the dataset registered under the key 'electricity' and display it.
x = tsdm.load_dataset('electricity')
x

In [None]:
# Draw a random boolean array with the same shape as ``x``.
# NOTE: no seed is set, so the mask differs on every run.
observed = np.random.choice([True, False], size=x.shape)
# Rebind ``x`` to the result of ``where(observed)``.
# NOTE(review): assumes pandas-style ``where`` (entries with a False condition become NaN) — confirm x's type.
x = x.where(observed)
display(observed, x)

### 2.1 Triplet Format

The data is represented as a set of triplets (time, variable, value). All NaNs are dropped.

In [None]:
# Convert the masked ``x`` with ``tsdm.make_dense_triplets`` and display the result.
dense_x = tsdm.make_dense_triplets(x)
dense_x

### 2.2 Sparse Triplet Format

The same as the triplet format above, but the variable is one-hot encoded and the result is stored as a sparse tensor.

In [None]:
# Convert the masked ``x`` with ``tsdm.make_sparse_triplets`` and display the result.
sparse_x = tsdm.make_sparse_triplets(x)
sparse_x

### 2.3 Masked Format

Here we get 3 tensors:

- x: the original data
- m: a boolean mask, 1: value observed, 0: value not observed (NaN)
- d: time since the channel was last observed

In [None]:
# Unpack the three values returned by ``tsdm.make_masked_format``.
# NOTE: this rebinds ``x``, so re-running the cell passes the already-converted value back in.
x, m, d = tsdm.make_masked_format(x)
display(x, m, d)

## 3. Visualizing the data

In [None]:
# Load the dataset again, this time under the name ``df``.
df = tsdm.load_dataset('electricity')
# Differences between consecutive index entries.
ΔT = np.diff(df.index)
# The first difference, expressed in minutes; used as the reference step.
Δt = ΔT[0].astype("timedelta64[m]")
# Fail early unless every step equals the reference step (regular sampling).
assert np.all(ΔT == Δt)
# NOTE(review): assumes ``df`` is two-dimensional (rows, columns) — confirm.
N, M = df.shape
# remove first year from the data (useless zeros)
# ``span`` is the number of Δt-steps in 365 days, i.e. how many leading rows are dropped.
span = np.timedelta64(365, "D")//Δt
df = df.iloc[span:]

In [None]:
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

# 3x2 grid: one row per smoothing window, left column = whole series,
# right column = zoom on the selected date range.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(24, 9), tight_layout=True, sharex='col', sharey=True)

# Five randomly chosen column positions to plot.
# NOTE: no seed is set, so a different set of columns is drawn on every run.
selection = np.random.randint(low=0, high=M, size=5)
# selection = [319]

# Date range shown in the right-hand column (computed once; it does not
# depend on the smoothing window).
a = np.datetime64('2013-01-01')
b = np.datetime64('2013-02-01')
mask = (df.index >= a) & (df.index <= b)

for k, timedelta in enumerate((Δt, np.timedelta64(24, "h"), np.timedelta64(7, "D"))):
    # Window length in samples; for ``timedelta == Δt`` this is 1 (no smoothing).
    window = timedelta // Δt
    for column, frame in enumerate((df, df[mask])):
        # ``rolling`` works along the rows by default, so the deprecated
        # ``axis=0`` argument is omitted.
        data = frame.rolling(window=window, min_periods=1).mean()

        for col in data.iloc[:, selection]:
            ax[k, column].plot(data.index, data[col])
        # Every panel gets its title here, so no titles are set beforehand.
        ax[k, column].set_title(F"{timedelta}-rolling average")
        ax[k, column].set_ylabel("electricity consumption in kW")