In [None]:
# Load the `dotenv` IPython extension, then run its `%dotenv` line magic.
# NOTE(review): presumably this reads a local .env file into the process
# environment before the dataset/output paths are resolved below -- confirm.
%load_ext dotenv
%dotenv

In [None]:
from pprint import pprint
from datetime import datetime, timedelta
from tqdm import tqdm
import polars as pl
from eott_dataset import *
from eott_dataset.utils import *
from eott_dataset.etl import *


def confirm():
    """Prompt on stdin and report whether the user agreed to continue.

    The reply is compared case-insensitively and with surrounding whitespace
    removed, so answers such as "Y", " yes" or "y " are all accepted.

    Returns:
        bool: True when the normalised reply is "y" or "yes"; False for any
        other reply, including an empty one.
    """
    # Strip before comparing: a stray space or tab after "y" would otherwise
    # be read as a refusal and silently skip the guarded step.
    answer = input("continue?").strip().lower()
    return answer in {"yes", "y"}


# NOTE(review): the two locations are presumably configured through the
# EOTT_DATASET_PATH and EOTT_OUTPUT_PATH environment variables, e.g.
#   %env EOTT_DATASET_PATH=/path/to/dataset
#   %env EOTT_OUTPUT_PATH=/path/to/output
# -- confirm against get_dataset_root() / get_output_path().

# Resolve the output location first (later cells write into `output_dir`),
# then echo both locations so a wrong configuration is visible immediately.
output_dir = get_output_path()
input_dir = get_dataset_root()
print("input:", input_dir)
print("output:", output_dir)

In [None]:
# Materialise the participant table once; it feeds the two lookups, the
# parquet export and the cell output below.
pdf = participant_dataframe().collect()

# pid -> value lookups, each built from a two-column selection so that every
# row is already a (key, value) pair.
starts: dict[int, datetime] = dict(pdf.select("pid", "start_time").iter_rows())
recs: dict[int, timedelta] = dict(pdf.select("pid", "rec_time").iter_rows())

pdf.write_parquet(output_dir.joinpath("participant.parquet"), compression="uncompressed")

# Last expression of the cell: show the participant frame.
pdf

In [None]:
# Guarded step: the webcam export only runs after an explicit confirmation.
if confirm():
    # Count the input files up front so the progress bar has a known total.
    webcam_file_count = len(list(glob_webcam_files()))
    with tqdm(total=webcam_file_count, desc="webcam") as progress:
        # The callback advances the bar by one each time it is invoked.
        webcam_frame = webcam_dataframe(callback=lambda: progress.update(1))
        webcam_frame.sink_parquet(
            output_dir.joinpath("webcam.parquet"),
            compression="uncompressed",
        )

In [None]:
# Guarded step: the screen export only runs after an explicit confirmation.
if confirm():
    # Count the input files up front so the progress bar has a known total.
    screen_file_count = len(list(glob_screen_files()))
    with tqdm(total=screen_file_count, desc="screen") as progress:
        # The callback advances the bar by one each time it is invoked.
        screen_frame = screen_dataframe(callback=lambda: progress.update(1))
        screen_frame.sink_parquet(
            output_dir.joinpath("screen.parquet"),
            compression="uncompressed",
        )

In [None]:
# Columns written for every log source.
common_columns = ["pid", "record", "timestamp", "study", "duration", "trusted"]

# Extra columns written only for the given source.
source_columns: dict[Source, list[str]] = {
    Source.MOUSE: ["event", "page", "mouse", "window", "inner", "outer"],
    Source.SCROLL: ["event", "scroll"],
    Source.INPUT: ["event", "caret", "text"],
    Source.TEXT: ["text"],
    Source.LOG: ["event"],
}

# Collect the log dataset and split it by source inside one StringCache
# context; each group is written to its own "<source>.parquet" file.
with pl.StringCache():
    log_frame = log_dataset().collect()
    # Grouping by a one-element list yields one-element keys; unpack them here.
    for (source,), df in log_frame.group_by(["source"]):
        assert source is not None

        df = df.select([*common_columns, *source_columns[source]])
        path = (output_dir / source).with_suffix(".parquet")

        df.write_parquet(path, compression="uncompressed")

In [None]:
# Collect the tobii dataset and write it out in one expression, so no extra
# reference to the collected frame is kept afterwards.
# NOTE(review): this is the only export written with lz4 compression; the
# other tables in this notebook are uncompressed -- confirm that is intended.
(
    tobii_dataset()
    .collect()
    .write_parquet(output_dir.joinpath("tobii.parquet"), compression="lz4")
)

In [None]:
# Collect the dot dataset and write it out in one expression, so no extra
# reference to the collected frame is kept afterwards.
(
    dot_dataset()
    .collect()
    .write_parquet(output_dir.joinpath("dot.parquet"), compression="uncompressed")
)

In [None]:
# Collect the calibration dataset and write it out in one expression, so no
# extra reference to the collected frame is kept afterwards.
(
    calibration_dataset()
    .collect()
    .write_parquet(output_dir.joinpath("calibration.parquet"), compression="uncompressed")
)

In [None]:
# Collect the trackbox dataset and write it out in one expression, so no
# extra reference to the collected frame is kept afterwards.
(
    trackbox_dataset()
    .collect()
    .write_parquet(output_dir.joinpath("trackbox.parquet"), compression="uncompressed")
)

In [None]:
# Sanity check: every parquet file in the output directory can be read back.
# Each file is loaded in full, its schema printed, and the frame dropped again
# before the next file is read.
for parquet_path in output_dir.glob("*.parquet"):
    print(parquet_path)
    df = pl.read_parquet(parquet_path, use_statistics=False)
    pprint(df.schema)
    print()  # blank line between files
    del df

In [None]:
# Lazily open the exported screen table; `df` is queried in the next cell.
df = pl.scan_parquet(output_dir.joinpath("screen.parquet"))

In [None]:
from io import BytesIO
from decord import VideoReader

# Spot-check: decode one exported recording and print its average frame rate.
# head(1) limits the lazy query to the first matching row, so only that row's
# "video" value is materialised rather than every row stored for pid=1.
rows = df.filter(pid=1).head(1).collect().to_dicts()
if not rows:
    # Fail with a descriptive message instead of a bare IndexError on rows[0].
    raise LookupError("no rows with pid=1 found in the scanned parquet file")
entry = rows[0]

# The "video" value is handed to decord wrapped in an in-memory file object.
vr = VideoReader(BytesIO(entry["video"]))

print(vr.get_avg_fps())
del vr