# PhysioNet 2012

In [None]:
import os
import tarfile
import tempfile
from collections.abc import Mapping
from typing import IO, Any, Literal, TypeAlias

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm.autonotebook import tqdm

from tsdm.datasets.base import MultiTableDataset
from tsdm.encoders import TripletDecoder
from tsdm.types.aliases import PathLike

In [None]:
import logging

# Configure the root logger at INFO level; this runs before `tsdm` is imported.
logging.basicConfig(level=logging.INFO)

import tsdm

In [None]:
# Instantiate the PhysioNet2012 dataset class from tsdm and bind it to `self`;
# the bare expression on the last line shows the object as the cell output.
self = tsdm.datasets.PhysioNet2012()
self

In [None]:
# Display the dataset's `timeseries` attribute
# (presumably a pandas DataFrame, given the `.describe()` / `.hist()` calls below — confirm).
self.timeseries

In [None]:
# Display the dataset's `timeseries_description` attribute.
self.timeseries_description

In [None]:
# Summary statistics of the time series, requesting the 1% and 99% percentiles;
# the result is transposed with `.T` for display.
self.timeseries.describe(percentiles=[0.01, 0.99]).T

## Histograms

In [None]:
# Histograms of the time series with 20 bins on a 20x15 figure.
# The trailing semicolon suppresses the cell's return-value output.
self.timeseries.hist(bins=20, figsize=(20, 15));

## Marginal densities

In [None]:
# Same histograms as above, but with `density=True` and `log=True` passed through
# to the plotting call. The trailing semicolon suppresses the return-value output.
self.timeseries.hist(bins=20, figsize=(20, 15), log=True, density=True);

In [None]:
# Histograms of the dataset's `metadata` attribute with 20 bins.
self.metadata.hist(bins=20);

In [None]:
# Table of per-variable records, one 7-tuple per variable.
# Presumably the fields are (name, lower, upper, lower_included, upper_included,
# unit, description) — this matches the column names selected from `VF` in a
# later cell, but is not established here; confirm.
# NOTE(review): the entries describe air-quality / weather variables (PM2.5, SO2,
# wind speed, ...), which do not obviously belong to the PhysioNet dataset loaded
# above — confirm this cell is intended for this notebook.
data = [
    # fmt: off
    ("PM2.5", 0,    None, True, True, "μg/m³", "PM2.5 concentration"),
    ("PM10",  0,    None, True, True, "μg/m³", "PM10 concentration" ),
    ("SO2",   0,    None, True, True, "μg/m³", "SO2 concentration"  ),
    ("NO2",   0,    None, True, True, "μg/m³", "NO2 concentration"  ),
    ("CO",    0,    None, True, True, "μg/m³", "CO concentration"   ),
    ("O3",    0,    None, True, True, "μg/m³", "O3 concentration"   ),
    ("TEMP",  None, None, True, True, "°C",    "temperature"        ),
    ("PRES",  0,    None, True, True, "hPa",   "pressure"           ),
    ("DEWP",  None, None, True, True, "°C",    "dew point"          ),
    ("RAIN",  0,    None, True, True, "mm",    "precipitation"      ),
    ("wd",    None, None, True, True, None,    "wind direction"     ),
    ("WSPM",  0,    None, True, True, "m/s",   "wind speed"         ),
    # fmt: on
]

In [None]:
import pyarrow as pa

In [None]:
# Build a pyarrow Table from the "lower", "upper", "lower_included" and
# "upper_included" columns of `VF` after resetting its index.
# NOTE(review): `VF` is not defined in any of the visible cells — confirm where
# it is created before this cell runs.
table = pa.Table.from_pandas(
    VF.reset_index()[["lower", "upper", "lower_included", "upper_included"]]
)

In [None]:
# Call `combine_chunks()` on every item yielded by iterating `table` and show
# how many results were produced.
len([column.combine_chunks() for column in table])

In [None]:
# `import pyarrow as pa` does not import the `parquet` submodule by itself, so
# `pa.parquet` is only reachable if some other module happened to import it
# first. Import the submodule explicitly before listing its names.
import pyarrow.parquet as pq

dir(pq)

In [None]:
# Pack the chunk-combined columns of `table` into a single struct array whose
# fields are taken from `interval`.
# NOTE(review): `interval` is defined in a later cell of this notebook, so this
# cell fails with a NameError when the notebook is run top to bottom — confirm
# the intended cell order.
arr = pa.StructArray.from_arrays(
    [column.combine_chunks() for column in table],
    fields=list(interval),
)

In [None]:
# Wrap the struct array in a one-column pyarrow table; the column is named "interval".
t = pa.table([arr], names=["interval"])

In [None]:
# `import pyarrow as pa` does not import the `parquet` submodule by itself, so
# `pa.parquet.write_table` raises AttributeError unless something else imported
# it first. Import the submodule explicitly.
import pyarrow.parquet as pq

# Write the table `t` to a Parquet file at the relative path "foo".
pq.write_table(t, "foo")

In [None]:
# Arrow struct type with two float32 bound fields and two boolean flags saying
# whether each bound is included. Fields are given as (name, type) pairs.
interval = pa.struct(
    [
        ("lower", pa.float32()),
        ("upper", pa.float32()),
        ("lower_included", pa.bool_()),
        ("upper_included", pa.bool_()),
    ]
)
# Bare expression: show the resulting type as the cell output.
interval