In [None]:
%load_ext dotenv
%dotenv

In [None]:
from pathlib import Path
from datetime import datetime, timedelta
from pprint import pprint
from os import environ
from tqdm import tqdm

import polars as pl
import cv2 as cv

from eott_dataset import *

In [None]:
# Paths are taken from the environment. Set them in the `.env` file read by
# `%dotenv`, or uncomment and complete the lines below.
# NOTE(review): EOTT_DATASET_PATH is not read in this notebook; presumably it is
# consumed by the `eott_dataset` package — confirm.
# %env set EOTT_DATASET_PATH
# %env set EOTT_OUTPUT_PATH

output_dir = Path(environ["EOTT_OUTPUT_PATH"]).expanduser().resolve()
# Create the directory up front so the `write_parquet` calls in later cells do
# not fail because the target directory is missing.
output_dir.mkdir(parents=True, exist_ok=True)
print("output directory:")
print(output_dir)

# One row per participant; each row is turned into a `Participant` object.
pdf = read_participant_characteristics()
ps = [Participant.from_dict(**row) for row in pdf.iter_rows(named=True)]

pdf

In [None]:
from contextlib import suppress
from typing import Iterable, Literal



def get_filename(source: str, ext: str = "parquet"):
    """Build the output file name for a dataset.

    Args:
        source: Base name of the dataset (for example ``"log"``).
        ext: File extension without the leading dot.

    Returns:
        ``source`` and ``ext`` joined by a single dot.
    """
    return "{}.{}".format(source, ext)



def with_pid_column(df: pl.DataFrame, value: int):
    """Add a ``frame`` row index and a constant ``pid`` column to ``df``.

    Args:
        df: Frame to extend.
        value: Participant id written to every row as a ``UInt8`` literal.

    Returns:
        A new frame with the ``frame`` row-index column and the ``pid`` column.
    """
    indexed = df.with_row_index(name="frame")
    participant_id = pl.lit(value, pl.UInt8)
    return indexed.with_columns(pid=participant_id)



def log_dataframe(p: Participant):
    """Return the participant's interaction log reduced to its common columns.

    Args:
        p: Participant whose ``user_interaction_logs`` are exported.

    Returns:
        A frame with the columns ``pid``, ``frame``, ``index``, ``study``,
        ``event``, ``epoch`` and ``time``, in that order.
    """
    columns = ("pid", "frame", "index", "study", "event", "epoch", "time")
    logs = with_pid_column(p.user_interaction_logs, p.pid)
    return logs.select(*columns)



def timeline_dataframe(p: Participant):
    """Return the participant's timeline tagged with its participant id.

    Args:
        p: Participant passed to ``get_timeline``.

    Returns:
        A frame with the columns ``pid``, ``frame``, ``index``, ``study``,
        ``source`` and ``offset``, in that order.
    """
    timeline = get_timeline(p)
    tagged = timeline.with_columns(pid=pl.lit(p.pid, pl.UInt8))
    return tagged.select(["pid", "frame", "index", "study", "source", "offset"])



def scroll_dataframe(p: Participant):
    """Extract the scroll events from the participant's interaction log.

    Args:
        p: Participant whose ``user_interaction_logs`` are filtered.

    Returns:
        A frame with ``pid``, ``frame`` and a ``scroll`` struct (fields ``x`` and
        ``y``), or ``None`` when polars reports a missing column.
    """
    logs = with_pid_column(p.user_interaction_logs, p.pid)
    try:
        scroll_events = logs.filter(event="scroll")
        position = pl.struct(x="scroll_x", y="scroll_y")
        return scroll_events.select("pid", "frame", scroll=position)
    except pl.exceptions.ColumnNotFoundError:
        # The log lacks one of the referenced columns: nothing to export.
        return None



def mouse_dataframe(p: Participant):
    """Extract the mouse events from the participant's interaction log.

    Args:
        p: Participant whose ``user_interaction_logs`` are filtered.

    Returns:
        A frame with ``pid``, ``frame``, four position structs (``screen``,
        ``client``, ``window``, ``page``; fields ``x``/``y``) and two size structs
        (``inner``, ``outer``; fields ``w``/``h``), or ``None`` when polars
        reports a missing column.
    """

    def position(prefix: str):
        # Struct holding the "<prefix>_x" / "<prefix>_y" pair.
        return pl.struct(x=f"{prefix}_x", y=f"{prefix}_y")

    def size(prefix: str):
        # Struct holding the "<prefix>_width" / "<prefix>_height" pair.
        return pl.struct(w=f"{prefix}_width", h=f"{prefix}_height")

    logs = with_pid_column(p.user_interaction_logs, p.pid)

    try:
        mouse_events = logs.filter(event="mouse")
        return mouse_events.select(
            "pid",
            "frame",
            screen=position("screen"),
            client=position("client"),
            window=position("window"),
            page=position("page"),
            inner=size("inner"),
            outer=size("outer"),
        )
    except pl.exceptions.ColumnNotFoundError:
        # The log lacks one of the referenced columns: nothing to export.
        return None


def tobii_dataframe(p: Participant):
    """Build the participant's Tobii gaze table with one struct per measurement.

    Starts from ``p.tobii_gaze_predictions``, adds ``frame`` and ``pid`` through
    ``with_pid_column``, converts every Array-typed column into a struct with
    ``x``/``y`` (or ``x``/``y``/``z``) fields, and finally selects ``pid``,
    ``frame`` and the columns produced by ``get_columns``.

    NOTE(review): the renamed timestamp columns are not part of the final
    ``select``, so they are absent from the returned frame — confirm intended.
    """

    def get_columns():
        """Map each output column name to a struct of its left/right source columns.

        With the names, suffixes and ``check`` rules below, the resulting keys
        are ``pupil_validity``, ``pupil_diameter``, ``gazeorigin_ucs``,
        ``gazeorigin_validity``, ``gazepoint_display`` and ``gazepoint_validity``.
        """
        sides = ("left", "right")
        names = ("pupil", "gaze_origin", "gaze_point")
        suffixes = (
            "on_display_area",
            "in_user_coordinate_system",
            "validity",
            "diameter",
        )

        # Shorter labels used in the output column names.
        suffix_rename = {
            "on_display_area": "display",
            "in_user_coordinate_system": "ucs",
        }

        def get_key(name: str, suffix: str):
            # Output name: measurement without underscores + (shortened) suffix.
            name = name.replace("_", "")
            suffix = suffix_rename.get(suffix, suffix)
            return f"{name}_{suffix}"

        def get_value(name: str, suffix: str):
            # Struct with fields "left"/"right" read from "<side>_<name>_<suffix>".
            return pl.struct(**{side: f"{side}_{name}_{suffix}" for side in sides})

        def check(name: str, suffix: str):
            # Reject the (name, suffix) combinations that are not exported.
            match (name, suffix):
                # pupil: skip the two coordinate suffixes (they contain "_").
                case ("pupil", s) if "_" in s:
                    return False
                # gaze_origin / gaze_point: skip "diameter".
                case (n, "diameter") if n.startswith("gaze"):
                    return False
                # gaze_origin: skip "on_display_area".
                case (n, s) if "origin" in n and "display" in s:
                    return False
                # gaze_point: skip "in_user_coordinate_system".
                case (n, s) if "point" in n and "system" in s:
                    return False
                case _:
                    return True

        return {
            get_key(name, suffix): get_value(name, suffix)
            for name in names
            for suffix in suffixes
            if check(name, suffix)
        }

    df = with_pid_column(p.tobii_gaze_predictions, p.pid)

    # Renames "device_time_stamp" -> "device" and "system_time_stamp" -> "system".
    # NOTE(review): neither "device" nor "system" contains "e_s", so the
    # `.replace("e_s", "es")` call changes nothing — possibly the intent was to
    # produce "*_timestamp" names; confirm.
    df = df.rename(
        {
            f"{name}_time_stamp": name.replace("e_s", "es")
            for name in ("device", "system")
        }

    )
    # Turn every Array column into a struct: width 3 gets fields x/y/z, any
    # other width gets fields x/y.
    df = df.with_columns(
        *(
            pl.col(key).arr.to_struct(
                ["x", "y", "z"] if value.width == 3 else ["x", "y"]
            )
            for key, value in df.schema.items()
            if value.base_type() is pl.Array
        )
    )

    return df.select("pid", "frame", **get_columns())


def calibration_dataframe(p: Participant):
    """Return the participant's Tobii calibration points as nested structs.

    Args:
        p: Participant whose ``tobii_calibration_points`` are exported.

    Returns:
        A frame with ``pid``, ``index`` (the row index added as ``frame``),
        ``point`` (``x``/``y``), ``validity`` (``left``/``right``) and the
        per-eye predictions ``left`` and ``right`` (``x``/``y`` each).
    """
    points = with_pid_column(p.tobii_calibration_points, p.pid)
    return points.select(
        "pid",
        index="frame",
        point=pl.struct(x="point_x", y="point_y"),
        validity=pl.struct(left="validity_left", right="validity_right"),
        left=pl.struct(x="prediction_x_left", y="prediction_y_left"),
        right=pl.struct(x="prediction_x_right", y="prediction_y_right"),
    )


def trackbox_dataframe(p: Participant):
    """Return the participant's Tobii track-box entries as a frame.

    The first element of ``p.tobii_specs`` is read as a mapping from a
    ``(z, y, x)`` key triple to a three-component point.
    NOTE(review): presumably these are the track-box corner labels and their
    coordinates — confirm against ``eott_dataset``.

    Args:
        p: Participant whose specs are exported.

    Returns:
        A frame with ``frame``, a ``position`` struct (categorical ``z``, ``y``,
        ``x``), a ``point`` array of three floats, and ``pid``.
    """
    schema = {
        "position": pl.Struct(
            {"z": pl.Categorical(), "y": pl.Categorical(), "x": pl.Categorical()}
        ),
        "point": pl.Array(pl.Float64, 3),
    }
    # Use the specs of the participant passed in; the previous code read
    # `ps[0]`, so every participant was exported with the first one's values.
    trackbox = p.tobii_specs[0]
    data = [
        [{"z": az, "y": ay, "x": ax}, (px, py, pz)]
        for (az, ay, ax), (px, py, pz) in trackbox.items()
    ]

    with pl.StringCache():
        df = pl.DataFrame(data, schema, orient="row")

    return with_pid_column(df, p.pid)


def eott_dataframe(
    name: Literal[
        "log", "mouse", "scroll", "calibration", "trackbox", "tobii", "timeline"
    ],
    it: Iterable[Participant],
):
    """Build one dataset for all participants and concatenate the results.

    Args:
        name: Which dataset to build; selects the per-participant builder.
        it: Participants to process (may be wrapped in a progress bar).

    Returns:
        The per-participant frames concatenated with ``pl.concat``. Builders
        that return ``None`` for a participant (mouse/scroll) are skipped.

    Raises:
        ValueError: If ``name`` is not one of the supported dataset names.
            Previously any unrecognised name silently produced scroll data.
    """
    builders = {
        "log": log_dataframe,
        "mouse": mouse_dataframe,
        "scroll": scroll_dataframe,
        "calibration": calibration_dataframe,
        "trackbox": trackbox_dataframe,
        "tobii": tobii_dataframe,
        "timeline": timeline_dataframe,
    }

    build = builders.get(name)
    if build is None:
        raise ValueError(
            f"unknown dataset name {name!r}; expected one of {sorted(builders)}"
        )

    # Only the mouse/scroll builders return None; filtering is a no-op otherwise.
    dfs = [df for df in map(build, it) if df is not None]

    # NOTE(review): `pl.concat` is expected to reject an empty list, which can
    # happen if no participant yields a frame — confirm desired behaviour.
    return pl.concat(dfs)

In [None]:
# Export these tables as lz4-compressed parquet files, one file per table.
for name in ("log", "mouse", "scroll", "calibration", "trackbox"):
    # Build each table inside a StringCache — presumably so categorical columns
    # created per participant can be concatenated; confirm.
    with pl.StringCache():
        df = eott_dataframe(name, tqdm(ps, name))
    df.write_parquet(output_dir / get_filename(name), compression="lz4")

In [None]:
# Export the Tobii gaze table as a zstd-compressed (level 4) parquet file.
name = "tobii"
df = eott_dataframe(name, tqdm(ps, desc=name))
df.write_parquet(
    output_dir / get_filename(name),
    compression="zstd",
    compression_level=4,
)

In [None]:
# Export the timeline table as a zstd-compressed (level 2) parquet file.
name = "timeline"
df = eott_dataframe(name, tqdm(ps, desc=name))
df.write_parquet(
    output_dir / get_filename(name),
    compression="zstd",
    compression_level=2,
)

In [None]:
# Sanity check: read every parquet file in the output directory back and show
# its schema.
for path in output_dir.glob("*.parquet"):
    # Print the path before reading so a failing read can be attributed to a file.
    print(path)
    df = pl.read_parquet(path, use_statistics=False)
    pprint(df.schema)
    print()