# USHCN Dataset

In [None]:
# Notebook-wide display settings: echo the value of a trailing expression or
# assignment at the end of every cell.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension in mode 2 so that edited modules are re-imported
# before code is executed.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Configure the root logger at INFO level so log messages show up in cell output.
logging.basicConfig(level=logging.INFO)

In [None]:
%%time
# Instantiate the USHCN dataset wrapper from tsdm; %%time reports how long the
# cell takes to run.
from tsdm.datasets import USHCN

ds = USHCN()

In [None]:
# Display the dataset's `us_daily` attribute.
# (presumably the table of daily observations; verify against tsdm)
ds.us_daily

In [None]:
import os

# Select Modin's execution backend through the MODIN_ENGINE environment variable.
# NOTE(review): presumably this has to be set before modin is first used - confirm.
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

In [None]:
import modin.pandas as mpd
import pandas
import pandas as pd

In [None]:
colspecs: dict[str | tuple[str, int], tuple[int, int]] = {
    "COOP_ID": (1, 6),
    "YEAR": (7, 10),
    "MONTH": (11, 12),
    "ELEMENT": (13, 16),
}

for k, i in enumerate(range(17, 258, 8)):
    colspecs |= {
        ("VALUE", k + 1): (i, i + 4),
        ("MFLAG", k + 1): (i + 5, i + 5),
        ("QFLAG", k + 1): (i + 6, i + 6),
        ("SFLAG", k + 1): (i + 7, i + 7),
    }

MFLAGS = pandas.CategoricalDtype(list("BDHKLOPTW"))
QFLAGS = pandas.CategoricalDtype(list("DGIKLMNORSTWXZ"))
SFLAGS = pandas.CategoricalDtype(list("067ABFGHKMNRSTUWXZ"))
ELEMENTS = pandas.CategoricalDtype(("PRCP", "SNOW", "SNWD", "TMAX", "TMIN"))

dtypes = {
    "COOP_ID": pandas.Int32Dtype(),
    "YEAR": pandas.UInt16Dtype(),
    "MONTH": pandas.UInt8Dtype(),
    "ELEMENT": ELEMENTS,
    "VALUE": pandas.Int16Dtype(),
    "MFLAG": MFLAGS,
    "QFLAG": QFLAGS,
    "SFLAG": SFLAGS,
}

# dtypes but with same index as colspec.
dtype = {
    key: dtypes[key[0]] if isinstance(key, tuple) else dtypes[key] for key in colspecs
}

# pandas wants list[tuple[int, int]], 0 indexed, half open intervals.
cspec = [(a - 1, b) for a, b in colspecs.values()]

# per column values to be interpreted as nan
na_values = {("VALUE", k): "-9999" for k in range(1, 32)}
us_daily_path = "/home/rscholz/.tsdm/rawdata/USHCN/us.txt.gz"

In [None]:
%%time
# Parse the gzipped fixed-width file with Modin, using the column intervals,
# per-column dtypes and NaN markers prepared above. `names=colspecs` passes the
# dict itself, so its keys supply the column labels.
# Note: this rebinds `ds`, which previously held the USHCN dataset object.
ds = mpd.read_fwf(
    us_daily_path,
    colspecs=cspec,
    names=colspecs,
    na_values=na_values,
    dtype=dtype,
    compression="gzip",
)

In [None]:
ds

In [None]:
%%time
# The same call through plain pandas; %%time on both cells allows comparing
# the two readers.
df = pd.read_fwf(
    us_daily_path,
    colspecs=cspec,
    names=colspecs,
    na_values=na_values,
    dtype=dtype,
    compression="gzip",
)

In [None]:
df

In [None]:
# Smallest VALUE recorded for day 1; since "-9999" is mapped to NaN on read,
# the sentinel should not appear here.
ds[("VALUE", 1)].min()

In [None]:
# Separate the identifying columns from the per-day (field, day) columns.
id_cols = ["COOP_ID", "YEAR", "MONTH", "ELEMENT"]
data_cols = [label for label in ds.columns if label not in id_cols]
# Two-column frame listing the (VAR, DAY) parts of every per-day label:
# first cast to string / uint8, then turn both columns into categoricals.
columns = (
    mpd.DataFrame(data_cols, columns=["VAR", "DAY"])
    .astype({"VAR": "string", "DAY": "uint8"})
    .astype("category")
)

In [None]:
# Keep only the per-day columns and relabel them with a (VAR, DAY) MultiIndex
# built from the `columns` frame.
data = ds[data_cols]
data.columns = pandas.MultiIndex.from_frame(columns)

In [None]:
# Move the DAY level from the columns into the rows (wide -> long), dropping
# rows that are entirely missing, then turn DAY back into a regular column.
data = data.stack(level="DAY", dropna=True).reset_index(level="DAY")

In [None]:
# Cast the columns present in `data` to their declared dtypes; DAY becomes int8.
_dtypes = {k: v for k, v in dtypes.items() if k in data.columns} | {
    "DAY": "int8",
}
data = data.astype(_dtypes)

# recombine data columns with original data
# (inner join on the index, i.e. on the original row labels kept by the stack)
data = ds[id_cols].join(data, how="inner")
data = data.astype(dtypes | {"DAY": "int8"})

In [None]:
# Replace the existing index with a fresh default range index.
data = data.reset_index(drop=True)
# Assemble timestamps from the YEAR/MONTH/DAY columns; errors="coerce" yields
# NaT for combinations that cannot be converted instead of raising.
datetimes = mpd.to_datetime(data[["YEAR", "MONTH", "DAY"]], errors="coerce")
data = data.drop(columns=["YEAR", "MONTH", "DAY"])
data["time"] = datetimes
data = data.set_index(["COOP_ID", "time"])

In [None]:
# Count missing entries per column.
data.isna().sum()

In [None]:
# NOTE(review): the cell that builds `datetimes` already moves "time" into the
# index, so when the notebook is run top to bottom "time" is no longer a column
# here - confirm whether this cell is a leftover from an earlier variant.
data = data.set_index("time", append=True)

In [None]:
# Display the frame with its columns in this order; the result is not assigned,
# so `data` itself is left unchanged.
data.reindex(
    columns=[
        "ELEMENT",
        "MFLAG",
        "QFLAG",
        "SFLAG",
        "VALUE",
    ]
)

In [None]:
# Order the rows by station id, timestamp and element.
data = data.sort_values(by=["COOP_ID", "time", "ELEMENT"])

In [None]:
%%time
# Re-read the raw file with plain pandas, using the same arguments as the
# earlier read cells; `ds` is a pandas DataFrame from here on.
ds = pd.read_fwf(
    us_daily_path,
    colspecs=cspec,
    names=colspecs,
    na_values=na_values,
    dtype=dtype,
    compression="gzip",
)

In [None]:
%%time
id_cols = ["COOP_ID", "YEAR", "MONTH", "ELEMENT"]
data_cols = [col for col in ds.columns if col not in id_cols]
# Turn tuple[VALUE/FLAG, DAY] indices to multi-index:
columns = mpd.MultiIndex.from_tuples(ds[data_cols], names=["VAR", "DAY"])
data = mpd.DataFrame(ds[data_cols], columns=columns)

In [None]:
%%time
# Wide -> long: move the DAY column level into the rows, drop rows that are
# entirely missing, and expose DAY as a regular column.
DDS = data.stack(level="DAY", dropna=True).reset_index(level="DAY")

In [None]:
# Distinct dtypes among the MFLAG columns of the wide frame.
data.dtypes.loc["MFLAG"].unique()

In [None]:
# Preview of the long table joined back onto the id columns (not assigned).
ds[id_cols].join(DDS, how="inner")

In [None]:
data

In [None]:
# Summary of the long table after casting its columns to the declared dtypes
# (the cast result is not kept).
DDS.astype({k: v for k, v in dtypes.items() if k in DDS.columns}).info()

In [None]:
data.dtypes

In [None]:
# Distinct MFLAG codes present in the long table.
DDS.MFLAG.unique()

In [None]:
# Build the (VAR, DAY) column MultiIndex from a two-column frame whose columns
# are first cast to string / uint8 and then to categoricals.
columns = (
    mpd.DataFrame(data_cols, columns=["VAR", "DAY"])
    .astype({"VAR": "string", "DAY": "uint8"})
    .astype("category")
)
columns = mpd.MultiIndex.from_frame(columns)

In [None]:
data.info()

In [None]:
# Variant 1: wrap the per-day columns and overwrite the column labels in place.
data = mpd.DataFrame(ds[data_cols])
data.columns = columns

In [None]:
%%time
# Variant 2: hand the MultiIndex to the constructor instead (timed).
data = mpd.DataFrame(ds[data_cols], columns=columns)

In [None]:
ds

In [None]:
data

In [None]:
# IPython help lookup.
# NOTE(review): `TextFileReader` is not imported anywhere in the visible cells -
# confirm where this name is expected to come from.
?TextFileReader

In [None]:
%%time
df = pd.read_fwf("/home/rscholz/.tsdm/rawdata/USHCN/us.txt.gz", compression="gzip")

In [None]:
%%time
df = mpd.read_fwf("/home/rscholz/.tsdm/rawdata/USHCN/us.txt.gz", compression="gzip")

In [None]:
import importlib.util

In [None]:
# Look up the import spec of modin, i.e. check whether it can be found,
# without importing it.
importlib.util.find_spec("modin")

In [None]:
# Import modin by name and display the resulting module object.
importlib.import_module("modin")

In [None]:
# NOTE(review): in top-to-bottom order `ds` is a DataFrame by now; the cells
# below call `download`, `us_daily`, `_load` and `stations` on it, so they
# presumably expect `ds` to be the USHCN object again - confirm execution order.
ds.download()

In [None]:
ds.us_daily

In [None]:
# Load the entry registered under the key "stations" via the private loader.
df = ds._load(key="stations")

In [None]:
# Preview COMPONENT_1 with the "------" placeholder replaced by pandas.NA
# (the result is displayed only, `df` is not modified).
df.COMPONENT_1.replace(to_replace="------", value=pandas.NA)

In [None]:
ds.stations

In [None]:
ds.stations

In [None]:
import os

# NOTE(review): the engine is set to "dask" here, yet the lines below import
# and initialise ray - confirm which backend is actually intended.
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
import ray
# Rebinds `pd`: from here on `pd` refers to modin.pandas, no longer to pandas.
from modin import pandas as pd

ray.init()

In [None]:
ds.us_daily

In [None]:
# NOTE(review): `dd` is only imported in the following cell, so this fails
# with a NameError when the notebook is run top to bottom.
dir(dd)

In [None]:
import os

import dask.dataframe as dd

# Wrap the YEAR/MONTH/DAY columns in a Dask dataframe named "ts", using
# os.cpu_count() as the number of partitions.
df = dd.from_pandas(
    ds.us_daily[["YEAR", "MONTH", "DAY"]], name="ts", npartitions=os.cpu_count()
)

In [None]:
# Convert the date parts to timestamps with Dask (errors="coerce" gives NaT
# for unconvertible rows) and materialise the result.
time = dd.to_datetime(df, errors="coerce").compute()

In [None]:
ds.stations

In [None]:
# Attach the computed timestamps to the daily table as a new "time" column.
ds.us_daily["time"] = time

In [None]:
ds.us_daily

In [None]:
pd.to_datetime(
    ds.us_daily[["YEAR", "MONTH", "DAY"]].iloc[:100_000_000],
    errors="coerce",
    cache=False,
    infer_datetime_format=True,
)

In [None]:
ds.us_daily