# Import dependencies

In [None]:
!pwd
import os
import sys

import numpy as np
import pandas as pd

from datetime import datetime
from datetime import timedelta
import uuid

# os.path.abspath() does not expand "~" (it would yield "<cwd>/~/code/..."),
# so resolve the home directory with expanduser() before making the path absolute.
sys.path.insert(0, os.path.abspath(os.path.expanduser('~/code/arkitektur-poc-tidsserier/src')))
sys.path

# Import Timeseries 

In [None]:
from ssb_timeseries.dataset import Dataset
from ssb_timeseries.properties import SeriesType, Versioning, Temporality
from ssb_timeseries.sample_data import create_df

Environment variables control where time series and logs are stored.

If they are not set, timeseries.io and timeseries.logging will attempt to use default values `/home/jovyan/series/` and `/home/jovyan/logs/`, but this will fail if the directories do not exist.

In [None]:
# os.environ.get(key, default) only *returns* a fallback value; it never writes
# to the environment. setdefault() assigns the default when the variable is
# unset and leaves an already configured value untouched.
os.environ.setdefault('TIMESERIES_ROOT', '/home/jovyan/series/')
os.environ.setdefault('LOG_LOCATION', '/home/jovyan/logs')
print(f"{os.environ.get('LOG_LOCATION')} & {os.environ.get('TIMESERIES_ROOT')}")

In [None]:
# ... but this does the trick: writing into os.environ sets the variables for
# the current process (whereas .get() only reads them).
os.environ.update(
    {
        'TIMESERIES_ROOT': '/home/jovyan/series/',
        'LOG_LOCATION': '/home/jovyan/logs',
    }
)

# Show the values that are now in effect.
print(f"{os.environ.get('LOG_LOCATION')} & {os.environ.get('TIMESERIES_ROOT')}")

# Create a new set

In [None]:
# Name of the dataset used throughout the first part of the notebook.
set_name = "demo-5"
#set_name = f"test-dataset-xxx"
# Date string handed to Dataset as `as_of_tz`.
as_of = "2023-12-08"
# Initialise the dataset as an "estimate" series type and print its summary.
x = Dataset(name=set_name, data_type=SeriesType.estimate(), as_of_tz=as_of)
print(x)

... note how `SeriesType.estimate()` translates to (or is defined as) `SeriesType(Versioning.AS_OF,Temporality.AT)`.

In [None]:
print(x)
print(x.data.shape)
x.data


Note that if x is a new dataset, the output shows `'data': 0`; meaning there are 0 datapoints defined for `x.as_of_utc` - `x.data.shape = 0`

In [None]:
#tags = {"A": ["JAE", "JA", "JEL", "JE", ""], "B": ["p", "q", "r"], "C": ["x", "y", "z"]}
#tag_values = [value for value in tags.values()]
# Assign sample data for the three series 'p', 'q' and 'r' to the dataset.
series_names = ['p', 'q', 'r']
x.data = create_df(series_names, start_date="2022-01-01", end_date="2023-01-01", freq="H")
# and take a look at the data:
print(x)
x.data

Now `x.data` is non-empty, but only in memory. Rerunning init will lose the data, and loading the same set does not return anything.

In [None]:
y = Dataset(name=set_name, data_type=SeriesType.estimate(), as_of_tz=as_of)
print(y)

Save to store the data to disk. Now, initialising again with the same parameters will read from disk:

In [None]:
x.save()
# after save, initialising again with the same parameters will read from disk

Take a closer look at the data:

In [None]:
print(x.data.shape)
x.data

... but y is a different object, so is still empty:

In [None]:
print(y.data.shape)

... unless initialised again, i.e. read from disk.

In [None]:
y = Dataset(name=set_name, data_type=SeriesType.estimate(), as_of_tz=as_of)
print(y.data.shape)

In [None]:
# NOTE(review): the builtin all() iterates over whatever `x == y` returns. If that
# is a pandas DataFrame, iteration yields the column labels rather than the boolean
# cells, so this would not actually compare values — confirm what Dataset.__eq__ returns.
all(x == y)

# Slightly more interesting data

In [None]:
# Tag dimensions; create_df receives one list of values per dimension.
tags = {"Mål": ["pris", "mengde"], "Varer": ["melk", "egg", "brød"]}

# Save one version of "demo-varehandel" for the first of each month,
# January through September 2023.
for d in (f"2023-{month:02d}-01" for month in range(1, 10)):
    some_data = create_df(*tags.values(), start_date="2024-01-01", end_date="2025-01-01", freq="M")
    Dataset(
        name="demo-varehandel",
        data_type=SeriesType.estimate(),
        as_of_tz=d,
        data=some_data*2,
    ).save()

In [None]:
februartall = Dataset(name="demo-varehandel", data_type=SeriesType.estimate(), as_of_tz="2023-02-01")
julitall = Dataset(name="demo-varehandel", data_type=SeriesType.estimate(), as_of_tz="2023-07-01")

In [None]:
februartall.data

In [None]:
julitall.data

In [None]:
februartall - julitall

... OK, not that exciting. To emphasize the intended relationship between series metadata and column (set-internal series) names: 

In [None]:
# One list of values per tag dimension. Line continuations are unnecessary
# inside the braces, so the dict is simply laid out one key per line.
tags = {
    "Mål": ["pris", "mengde"],
    "Varer": ["melk", "egg", "brød"],
    "Butikk": ["Rema", "Coop", "Norgesgr"],
    "Region": ["Trndlg", "Vstlnd", "Agder", "VfldTmrk", "Innlndt", "Viken", "TrmsFmrk"],
}

# Build the sample frame from all tag value lists and show it transposed,
# so the generated series names can be read as rows.
example = create_df(*tags.values(), start_date="2023-12-01", end_date="2024-01-01", freq="M")
example.transpose()

# Data access

Since `.data` is a Pandas dataframe, we can access it by way of all the standard methods, eg:

In [None]:
februartall.data.iloc[:, [0,2,]]

In [None]:
februartall.data.iloc[:3,2:]

In [None]:
februartall.data.loc[:9, ['valid_at', 'mengde_egg']]

Since this is standard Pandas, it comes with the standard Pandas quirks:

In [None]:
februartall.data.loc[:, 'pris_brød']

In [None]:
februartall.data.loc[:, ['pris_brød']]

In [None]:
februartall * 100

In [None]:
februartall.data - julitall.data

... for convenience we expose (some of) the same operations for the dataset object 

In [None]:
diff = februartall - julitall 
diff

Note that the above operation returns a dataframe, not a Dataset object.

This should probably be changed. Linear algebra implementation is work in progress.

# Plotting

Basic plotting functionality also works in the usual way.

In [None]:
julitall.data.plot()

The defaults are not good, though, so more sensible parameters are needed.

In [None]:
# Take legend visibility and title from the dataset being plotted (julitall),
# not from the unrelated datasets `y` and `x` created in earlier cells.
# The legend is shown only when there are fewer than 9 columns.
ax_x0 = julitall.data.plot(
    'valid_at',
    legend=len(julitall.data.columns) < 9,
    title=julitall.name,
    figsize=(12, 4),
)

In [None]:
# ... the above are implemented in the convenience method:
ax_x = julitall.plot()

In [None]:
# the axes object may be used in all the usual ways, eg additional parameters
ax_x = julitall.plot(xlabel='')

In [None]:
# plot only a couple of named series
julitall.plot(['pris_melk','pris_brød'])



In [None]:
x.data

Pandas comes with time aggregation features 

In [None]:
x.data.groupby(pd.PeriodIndex(x.data['valid_at'], freq='M')).sum(numeric_only=True)

... which may again be exposed at the Dataset level: 

In [None]:
x.groupby('Q','auto')

But we can also create our own fun stuff. 

In [None]:
x.vectors()

Oooops? What did this do? 

Read the fine print: `p = self.data['p']`

In [None]:
p

In [None]:
p * q * r

Ie, creating a variable per column header and assigning values accordingly.

In [None]:
julitall.vectors('pris')

In [None]:
pris_melk

... but not:

In [None]:
mengde_melk

Be careful! This one can have nasty side effects if column names happen to match variables or objects that already exist in the workspace:

In [None]:
p = 'some variable with other content'
p

In [None]:
x.vectors()
p

In [None]:
#tags = {"A": ["JAE", "JA", "JEL", "JE", ""], "B": ["indeks"], "C": ["pris", "antall", "vekt"]}
tags_priser = {"Mål": ["pris", "antall", "vekt"], "Varer": ["melk", "brød"]}
# NOTE(review): tags_mengder is defined but not used in this cell.
tags_mengder = {"Mål": ["antall", "vekt"], "Varer": ["melk", "brød"]}
#tag_values = [value for value in tags.values()]

# Build the data from the tag set defined in this cell. The bare name `tags`
# would silently pick up the larger dict left over from an earlier cell.
for d in ["2023-01-01", "2023-02-01", "2023-03-01", "2023-04-01"]:
    some_data = create_df(
        *tags_priser.values(),
        start_date="2024-01-01",
        end_date="2025-01-01",
        freq="M",
    )
    # Save one version of "demo-1" per as-of date; `x` ends up as the last one.
    x = Dataset(name="demo-1", data_type=SeriesType.estimate(), as_of_tz=d, data=some_data)
    x.save()

# Read an earlier version back for comparison with the latest one.
y = Dataset(name="demo-1", data_type=SeriesType.estimate(), as_of_tz="2023-02-01")

print(x.as_of_utc)
print(y.as_of_utc)

In [None]:
ax = x.plot()
(x - y).plot('valid_at', legend=len(y.data.columns)<9, title=x.name, figsize=(12, 4))
print(x.data.columns)
# Column labels as a plain list (the previous `[xx in x.data.columns]`
# referenced an undefined name and raised NameError).
col = list(x.data.columns)
# str has no .contains(); substring matching uses the `in` operator.
[c for c in col if 'pris' in c]
# A pandas Index has no .filter(); DataFrame.filter(like=...) keeps the columns
# whose label contains the substring, and .columns returns just those labels.
x.data.filter(like='pris_').columns
#df.loc[:, df.columns.isin(['nnn', 'mmm', 'yyy', 'zzzzzz'])]


In [None]:
y.as_of_utc += timedelta(days=1) 
y.as_of_utc 


In [None]:
y.data

# Experimental stuff / playground

In [None]:
# resample() takes the frequency as its first argument (`rule`); `level=` names an
# index level and is not a frequency. Aggregate to monthly sums keyed on the
# 'valid_at' column (assumes it holds datetimes, as the PeriodIndex cell above does).
x.data.resample('M', on='valid_at').sum()

In [None]:
from ssb_timeseries import io 

import glob
#for f in glob.glob('/path/**/*.c', recursive=True):
# `dataset` is not a name defined in this notebook, so go through the imported
# `io` module instead. recursive=True only has an effect when the pattern
# contains '**', so append it to walk everything below the root.
# NOTE(review): assumes ssb_timeseries.io exposes TIMESERIES_ROOT — confirm.
for f in glob.glob(os.path.join(io.TIMESERIES_ROOT, '**'), recursive=True):
    print(f)