In [1]:
import tarfile
import os
from pathlib import Path

import pandas as pd
import datazimmer as dz

from dotenv import load_dotenv

In [2]:
# Load variables from a .env file into the process environment (python-dotenv);
# NOTE(review): relies on the default .env lookup — confirm the file location.
load_dotenv()
# Directory that is scanned for "*.tar" archives by the conversion loop.
# A missing TKOM_DATA_DIR variable raises KeyError here.
raw_dir = Path(os.environ["TKOM_DATA_DIR"])

In [3]:
# Column names passed together to pd.to_datetime in the conversion loop to
# assemble a single timestamp (stored as the "hour" column) for every row.
dcols = ["year", "month", "day", "hour"]

In [5]:
# Convert every tar archive found in raw_dir into two parquet files written
# through dz.get_raw_data_path:
#   "<YYYY-MM>-traffic.parquet" – only the "traffic" column
#   "<YYYY-MM>-atts.parquet"    – every remaining attribute column
# Both are indexed by ("hid", "hour"). sorted() makes the processing order
# reproducible between runs (glob order is filesystem dependent).
for tarp in sorted(raw_dir.glob("*.tar")):
    # tarfile.open() is the documented entry point; the TarFile class is not
    # meant to be instantiated directly.
    with tarfile.open(tarp) as tar:
        member_dfs = []
        for memb in tar:
            member_file = tar.extractfile(memb)
            # extractfile() returns None for directories and other special
            # members; handing None to read_csv would fail, so skip them.
            if member_file is None:
                continue
            # Each member is read as a gzip-compressed CSV.
            member_dfs.append(pd.read_csv(member_file, compression="gzip"))

    # All members are parsed into memory at this point, so the archive handle
    # is already closed before the transformation and the parquet writes.
    if not member_dfs:
        # pd.concat would raise ValueError as well, but without saying which
        # archive was the culprit.
        raise ValueError(f"no readable members found in archive {tarp}")

    month_df = (
        pd.concat(member_dfs)
        # Replace the integer "hour" column by the full timestamp assembled
        # from the year/month/day/hour columns.
        .assign(hour=lambda df: pd.to_datetime(df[dcols]))
        .rename(columns={"raster_id": "hid"})
        .set_index(["hid", "hour"])
    )
    atts_df = month_df.drop(
        columns=["Unnamed: 0", "year", "month", "day", "traffic", "min_which_not_zero"]
    )

    # The first row's timestamp rendered as text starts with "YYYY-MM".
    month_id = str(month_df.index[0][1])[:7]
    month_df[["traffic"]].to_parquet(dz.get_raw_data_path(f"{month_id}-traffic.parquet"))
    atts_df.to_parquet(dz.get_raw_data_path(f"{month_id}-atts.parquet"))