In [None]:
# Load the `dotenv` IPython extension, then run its `%dotenv` magic.
# NOTE(review): presumably this reads a local `.env` file that supplies the
# EOTT_DATASET_PATH / EOTT_OUTPUT_PATH variables mentioned further down — confirm.
%load_ext dotenv
%dotenv

In [None]:
from pprint import pprint
from tqdm import tqdm
import polars as pl
from eott_dataset import *
from eott_dataset.utils import *
from eott_dataset.etl import *


def confirm() -> bool:
    """Ask on stdin whether to go ahead with the guarded step.

    Returns:
        True when the reply is "y" or "yes", ignoring letter case and any
        surrounding whitespace; False for every other reply, including an
        empty one.
    """
    # Strip before comparing so that replies such as "y " or " yes" still count.
    answer = input("continue?").strip().lower()
    return answer in {"yes", "y"}


# The dataset and output locations are resolved by the eott_dataset helpers.
# NOTE(review): presumably they are taken from the EOTT_DATASET_PATH and
# EOTT_OUTPUT_PATH environment variables — confirm. If those are not already
# set, they can be defined here, e.g.:
# %env EOTT_DATASET_PATH=<path to the raw dataset>
# %env EOTT_OUTPUT_PATH=<path for the generated parquet files>

output_dir = get_output_path()
dataset_root = get_dataset_root()
print("input:", dataset_root)
print("output:", output_dir)

In [None]:
# Materialise the participant table, write it to the output directory, and
# leave the frame as the last expression so the notebook displays it.
pdf = participant_dataframe().collect()
participant_path = output_dir / "participant.parquet"
pdf.write_parquet(participant_path, compression="uncompressed")
pdf

In [None]:
# NOTE(review): called here with no arguments, whereas the log export cell
# further down passes a file path and a participant id. The result is only
# shown as the cell output, so this looks like a leftover exploratory call —
# confirm it still runs on a fresh kernel, or remove the cell.
log_dataframe()

In [None]:
from eott_dataset.etl import _tobii_df

# Smoke test: convert only the first Tobii file that the glob yields.
# NOTE(review): the result goes to 'tobii_test.parquet' in the current working
# directory, not to output_dir, and `_tobii_df` is called with a single
# argument here but with a participant id in the full export — confirm both
# are intended.
p = next(glob_tobii_files())
tobii_lazy = _tobii_df(p)
tobii_lazy.collect().write_parquet('tobii_test.parquet')

In [None]:
# Export the per-participant log data as three parquet files: log, scroll, mouse.
# The StringCache block spans both the frame construction and the concat.
# NOTE(review): presumably this is needed so categorical columns built from
# different files can be concatenated — confirm against the polars docs.
with pl.StringCache():
    # One result per log file. Each result is unpacked below, so it is expected
    # to be a sequence of frames in (log, scroll, mouse) order — TODO confirm
    # against log_dataframe.
    dfs = [
        log_dataframe(p, pid_from_name(p.parent.name)) for p in glob_log_files()
    ]
    # zip(*dfs) regroups the per-file results by kind, so each `frames` holds
    # one kind of frame across all files. A separate loop name is used so the
    # `dfs` list is not rebound while it is being iterated.
    for frames, name in zip(zip(*dfs), ("log", "scroll", "mouse")):
        pl.concat(frames).write_parquet(output_dir / f"{name}.parquet", compression="lz4")

In [None]:
# Build one calibration frame per specs file, passing the id derived from the
# name of the file's parent directory, then concatenate and write the result.
calibration_frames = [
    calibration_dataframe(specs_path, pid_from_name(specs_path.parent.name))
    for specs_path in glob_specs_files()
]
df = pl.concat(calibration_frames)
df.collect().write_parquet(output_dir / "calibration.parquet", compression="uncompressed")

In [None]:
# Full Tobii export: one frame per Tobii file, concatenated and written once.
# Relies on `_tobii_df`, which is imported in an earlier cell of this notebook.
tobii_frames = [
    _tobii_df(tobii_path, pid_from_name(tobii_path.parent.name))
    for tobii_path in glob_tobii_files()
]
df = pl.concat(tobii_frames)
df.collect().write_parquet(output_dir / "tobii.parquet", compression="lz4")

In [None]:
# test files are readable
for path in output_dir.glob("*.parquet"):
    print(path)
    df = pl.read_parquet(path, use_statistics=False)
    pprint(df.schema)
    print()
    del df

In [None]:
# Open the exported screen table lazily; the following cell filters and collects it.
# NOTE(review): screen.parquet is written by the screen export cell at the end
# of this notebook, so with an empty output directory that cell must run first.
df = pl.scan_parquet(output_dir / "screen.parquet")

In [None]:
from io import BytesIO
from decord import VideoReader

# Spot check: take the first row with pid == 1 from the lazily scanned `df`
# (defined in the previous cell) and decode its "video" bytes in memory.
# Raises IndexError when no row matches.
entry = df.filter(pid=1).collect().to_dicts()[0]
vr = VideoReader(BytesIO(entry["video"]))

avg_fps = vr.get_avg_fps()
print(avg_fps)
# Drop the reader reference once the frame rate has been printed.
del vr

In [None]:
# Webcam export; runs only after an interactive confirmation.
if confirm():
    # Count the input files up front so the progress bar has a known total.
    webcam_total = len(list(glob_webcam_files()))
    with tqdm(desc="webcam", total=webcam_total) as t:
        # Each callback invocation advances the bar by one step.
        # NOTE(review): presumably called once per processed file — confirm.
        df = webcam_dataframe(callback=lambda: t.update(1))
        # The sink stays inside the `with` block so the bar is still open
        # while the frame is being written.
        df.sink_parquet(output_dir / "webcam.parquet", compression="uncompressed")

In [None]:
# Screen export; runs only after an interactive confirmation.
if confirm():
    # Count the input files up front so the progress bar has a known total.
    screen_total = len(list(glob_screen_files()))
    with tqdm(desc="screen", total=screen_total) as t:
        # Each callback invocation advances the bar by one step.
        # NOTE(review): presumably called once per processed file — confirm.
        df = screen_dataframe(callback=lambda: t.update(1))
        # The sink stays inside the `with` block so the bar is still open
        # while the frame is being written.
        df.sink_parquet(output_dir / "screen.parquet", compression="uncompressed")