# Data Export
The minimal work to translate and export the data from RealEye and Tobii.

In [None]:
#|default_exp data.export
# TODO(stephen): consider renaming to serde, Serialization and Deserialization

In [2]:
#| export
import polars as pl
from datetime import datetime, timedelta, timezone as UTC
from pathlib import Path
from typing import Callable, Iterable, Iterator, TypeVar, Union


## A fully typed approach to parsing and loading RealEye data

In [3]:
# Root of the exported study data, written as a path relative to the working directory.
DATA_ROOT = "../../RevChemData/2025-05-14-Data_Export"

# Sub-directories of DATA_ROOT for the Tobii and the RealEye exports, respectively.
TOBII_ROOT = f"{DATA_ROOT}/Tobii-All-Snapshot"
REALEYE_ROOT = f"{DATA_ROOT}/RealEye"

In [4]:
# Read only the four columns used below from the RealEye raw-gazes export;
# `test_created_at` is parsed as a Datetime instead of being left as a string.
raw_gazes_csv = pl.read_csv(
    f"{REALEYE_ROOT}/raw-gazes.csv",
    columns=["participant_id", "item_id", "test_created_at", "test_raw_data"],
    schema_overrides={"test_created_at": pl.Datetime},
)

In [5]:
raw_gazes_csv.head()

participant_id,item_id,test_created_at,test_raw_data
str,str,datetime[μs],str
"""62e1d6db-e570-4194-b792-1fc4ef…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 20:12:00,"""[[973,475,28,0,1588,1079],[917…"
"""a19074f0-818d-42d2-8d29-290429…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 21:38:52,"""[[902,599,36,0,1484,1077],[938…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 18:37:18,"""[[1085,721,36,0,1919,593],[102…"
"""105a3ff2-f0de-4d4b-adaa-feddee…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 20:12:28,"""[[1054,543,51,0,897,551],[1052…"
"""7fdc2add-bd15-4a04-b77b-242e5b…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-25 20:53:25,"""[[938,648,44,0,1919,400],[982,…"


In [6]:
#| export
from dataclasses import dataclass
from RevChem.realeye import GazeInfo, iter_parse_raw_data

T_source = TypeVar("T_source")
T_out = TypeVar("T_out")


class Resettable(Iterable[T_out]):
    """Re-iterable wrapper around `source_data`.

    Every call to `iter()` on the instance re-runs `iter_gen(source_data)`, so the
    same object can be looped over any number of times even when `iter_gen` is a
    generator function (whose result would otherwise be exhausted after one pass).
    """

    def __init__(
        self, source_data: T_source, iter_gen: Callable[[T_source], Iterable[T_out]]
    ):
        self._source = source_data  # handed unchanged to `iter_gen` on every pass
        self.iter_gen = iter_gen  # factory producing a fresh iterable for each pass

    def __iter__(self) -> Iterator[T_out]:
        # `iter_gen` is only promised to return an Iterable (e.g. a list), while the
        # iterator protocol requires `__iter__` to return an Iterator; `iter()` bridges
        # the two and is a no-op for generators and other iterators.
        return iter(self.iter_gen(self._source))


def iter_parse_raw_to_GazeInfo(raw_data: str) -> Iterator[GazeInfo]:
    """Lazily turn a raw gaze string into `GazeInfo` records.

    Each entry produced by `iter_parse_raw_data` is cut down to its first six
    values before the `GazeInfo` is built; entries longer than six are also
    reported on stdout.
    """
    for sextuple in iter_parse_raw_data(raw_data):
        length_ = len(sextuple)
        if length_ > 6:
            # A longer entry indicates that the RealEye system captured a mouse click. Nothing more.
            print(f"Got a {length_}-tuple: {sextuple = }")
        leading_six = sextuple[:6]
        yield GazeInfo(*leading_six)


def resettable_iter_raw(test_raw_data: str):
    """Wrap a raw gaze string so it is parsed into `GazeInfo` items afresh on each iteration."""
    return Resettable(source_data=test_raw_data, iter_gen=iter_parse_raw_to_GazeInfo)


@dataclass
class RealEyeRawRow:
    """One record of the RealEye raw-gazes table, with its gaze payload exposed as a re-iterable."""

    participant_id: str  # the participant being tested
    item_id: str  # the stimulus being shown
    test_created_at: datetime  # when the web browser started running (NOT THE SAME AS THE STIMULUS START TIME)
    raw_data: Iterable[GazeInfo] = None  # incoming raw value; replaced by a re-iterable in __post_init__

    def __post_init__(self):
        # Swap the value received by the constructor for a Resettable that parses it on demand.
        self.raw_data = Resettable(self.raw_data, iter_parse_raw_to_GazeInfo)

    @classmethod
    def from_row_tuples(cls, tuple) -> "RealEyeRawRow":
        """Build a row from a positional tuple ordered like the dataclass fields."""
        positional_fields = tuple
        return cls(*positional_fields)

In [7]:
#| hide
def fiddle_realeye_one():
    """Build a typed row from the first CSV record and print its first gaze entry, if there is one."""
    raw_row: RealEyeRawRow = RealEyeRawRow.from_row_tuples(raw_gazes_csv.row(0))
    nothing = object()  # sentinel: distinguishes "no entries" from any real entry
    first_gaze = next(iter(raw_row.raw_data), nothing)
    if first_gaze is not nothing:
        print(first_gaze)

fiddle_realeye_one()

GazeInfo(gaze_point_X=973, gaze_point_Y=475, time_ms_since_start=28, scroll_offset_Y=0, mouse_pos_X=1588, mouse_pos_Y=1079)


In [8]:
#| export
def cumulative_sum(items: list[int|float]) -> list[int|float]:
    """Calculate the cumulative sum up to and including a given index"""
    csum = 0
    res = [None] * len(items)
    for i, item in enumerate(items):
        csum += item
        res[i] = csum
    return res

In [9]:

def test_cumulative_sum():
    """Check `cumulative_sum` against three hand-computed sequences."""
    cases = [
        ([0, 1, 2, 3], [0, 1, 3, 6]),
        ([0, 2, 4, 6, 8], [0, 2, 6, 12, 20]),
        ([1, 3, 5, 7, 9], [1, 4, 9, 16, 25]),
    ]
    for given, expected in cases:
        actual = cumulative_sum(given)
        assert actual == expected, f"Cumulative sum was incorrect: {actual} != {expected}"

test_cumulative_sum()

In [10]:
# | export
def raw_gazes_row_to_df(
    row: RealEyeRawRow, # typed row from the CSV. should have few, if any changes, from the raw CSV file. Used for semantic tidyness
    *,
    time_since_name: str = "time_since_start", # new name given to the column that records the time (ms) since this stimulus was shown
    x_name: str = "X", # new name given to the column that captures the X-coordinate of the GazeInfo gaze
    y_name: str = "Y", # new name given to the column that captures the Y-coordinate of the GazeInfo gaze
) -> pl.DataFrame:
    """Expand one `RealEyeRawRow` into a frame with one record per gaze entry.

    Every record repeats the row's `test_created_at` next to the entry's
    millisecond offset and gaze X/Y; mouse and scroll values are not carried over.
    """
    created_at = row.test_created_at
    records = [
        (created_at, gaze.time_ms_since_start, gaze.gaze_point_X, gaze.gaze_point_Y)
        for gaze in row.raw_data
    ]
    column_types = {
        "test_created_at": pl.Datetime,
        time_since_name: pl.Int32,
        x_name: pl.Int32,
        y_name: pl.Int32,
    }
    frame = pl.DataFrame(records, schema=column_types, orient="row")
    # force everything to be UTC because that's what it should be (per docs)
    created_at_utc = pl.col("test_created_at").dt.replace_time_zone(time_zone="UTC")
    return frame.with_columns(created_at_utc)

In [11]:
#| export
from RevChem.common import dt_str_now, group_by

In [12]:
EXPORT_ROOT = Path(DATA_ROOT, "..", f"{dt_str_now()}-python-outputs").resolve()


# NOTE(review): this cell carries no `#| export` marker, yet the exported
# `pipeline_raw_realeye_to_timed_dataframe` calls this function — confirm that is intended.
def run_realeye_df_group_statistics(dfs: list[pl.DataFrame]):
    """Tabulate how many frames share each `test_created_at`, display the table, and save it as CSV.

    Frames are grouped by the first value of their `test_created_at` column.
    The resulting `n_rows` column holds the number of frames in each group.
    The CSV is written into `EXPORT_ROOT`, which is created first if missing.
    """
    grouped = group_by(lambda df: df["test_created_at"][0], dfs)
    group_statistics = pl.DataFrame(
        [[group_key, len(group)] for group_key, group in grouped.items()],
        schema=["test_created_at", "n_rows"],
        orient="row",
    )  # .sort("test_created_at")
    with pl.Config(tbl_rows=50):
        display(group_statistics)
    # EXPORT_ROOT is a freshly named output directory, so it may not exist yet;
    # without this the write below fails on a clean checkout.
    EXPORT_ROOT.mkdir(parents=True, exist_ok=True)
    group_statistics.write_csv(EXPORT_ROOT / f"{dt_str_now()}-row_stats.csv")

## Making a RealEye data pipeline

In [13]:
#| export
def realeye_timestamp_to_datetime(
    datetime_col: str = "test_created_at", # column with the recording start datetime
    timestamp_col: str = "time_ms_since_start", # the integer column representing the milliseconds since stimulus exposure
    *,
    overwrite: bool = True, # whether the result takes the name of `timestamp_col` (True) or of a new `<timestamp_col>__new_dt_time` column (False)
    additional_offset_ms: int = 0, # additional constant offset, in milliseconds, added to every resulting datetime
) -> pl.Expr:
    """Build the expression that turns the millisecond column `timestamp_col` into increasing datetimes.

    The expression adds, row by row, the milliseconds in `timestamp_col` and the constant
    `additional_offset_ms` to the reference datetime in `datetime_col`.

    Returns:
        A `pl.Expr` (not a DataFrame), aliased to `timestamp_col` when `overwrite` is True and to
        `<timestamp_col>__new_dt_time` otherwise. Apply it with `df.with_columns(...)`.
    """
    new_name = timestamp_col if overwrite else f"{timestamp_col}__new_dt_time"
    # A string handed to `pl.duration` is read as a column name, so this is a per-row duration.
    per_row_offset = pl.duration(milliseconds=timestamp_col)
    constant_offset = pl.duration(milliseconds=additional_offset_ms)
    return (pl.col(datetime_col) + per_row_offset + constant_offset).alias(new_name)

In [14]:
# | export

def correct_realeye_df_group(
    group_dfs: list[pl.DataFrame], *, time_col: str = "time_since_start"
):
    """Rewrite `time_col` of every frame in `group_dfs` as absolute datetimes, replacing the list entries in place.

    The frames are treated as one contiguous recording, in list order. The recording is
    taken to end at the earliest `test_created_at` in the group and to have lasted the sum
    of each frame's largest `time_col` value (milliseconds). Each frame is then shifted by
    the combined duration of the frames before it. Nothing is returned.
    """
    # 1. The largest millisecond offset of each frame, read as that frame's duration.
    frame_durations_ms = [frame[time_col].max() for frame in group_dfs]
    recording_length_ms = sum(frame_durations_ms)

    # 2. Start time = "test_created_at" - total duration (assumes a fully contiguous series).
    # The minimum (not the maximum) is used: writing the outputs takes time after the
    # recording is done, so the earliest creation time is the closest to the real end.
    recording_end = min(frame["test_created_at"][0] for frame in group_dfs)
    recording_start = recording_end - timedelta(
        microseconds=recording_length_ms * 1000
    )

    # 3. Running total shifted by one position: the first frame adds nothing,
    # the second adds the first frame's duration, the third the first two, etc.
    preceding_ms = [0] + cumulative_sum(frame_durations_ms[:-1])

    for position, offset_ms in enumerate(preceding_ms):
        anchored = group_dfs[position].with_columns(
            __temp_start_time=recording_start
        )
        as_datetime = realeye_timestamp_to_datetime(
            datetime_col="__temp_start_time",
            timestamp_col=time_col,
            additional_offset_ms=offset_ms,
        )
        group_dfs[position] = anchored.with_columns(as_datetime).drop(
            "__temp_start_time"
        )

In [15]:
# | export


# TODO: rename realeye data pipeline function
from RevChem.common import list_concat
from RevChem.tobii import REALEYE_ITEM_IDS, GroupedFrames


def pipeline_raw_realeye_to_timed_dataframe(
    re_raw_df: pl.DataFrame,  # result of pl.read_csv("raw-gazes.csv").
    *,
    do_group_stats_export: bool = False,  # whether to only compute early stats, write them to EXPORT_ROOT and exit early (returning None). Fails if EXPORT_ROOT is undefined.
    debug: bool = False,  # whether we output the first row of each dataframe, to debug what we're looking at.
    dt_timestamp_col: str = "time_since_start",  # name for the datetime timestamp column in the output dataframes
):
    """Turn the raw RealEye table into per-trial groups of frames with absolute timestamps.

    Steps:
      1. Parse every record into a `RealEyeRawRow`, ordered by the position of its
         `item_id` in `REALEYE_ITEM_IDS`, and expand each into a frame.
      2. Order the frames by `test_created_at` (stable within equal creation times).
      3. Group the frames by `test_created_at` truncated to the minute.
      4. Run `correct_realeye_df_group` on every group to make `dt_timestamp_col` a datetime.

    Returns:
        The `GroupedFrames` keyed by the minute-truncated `test_created_at`,
        or `None` when `do_group_stats_export` is set.
    """
    real_eye_rows: list[RealEyeRawRow] = sorted(
        map(RealEyeRawRow.from_row_tuples, re_raw_df.rows()),
        # sorted by item_id, leveraging that list.index(...) -> ordinal position
        key=lambda re_row: REALEYE_ITEM_IDS.index(re_row.item_id),
    )
    # rows to dataframe
    dfs = [
        raw_gazes_row_to_df(row, time_since_name=dt_timestamp_col)
        for row in real_eye_rows
    ]

    if do_group_stats_export:
        run_realeye_df_group_statistics(dfs)
        return

    if debug:
        display(pl.concat([df.head(1) for df in dfs]))

    # group by "creation" time, down to the second, to get first ordering
    # then flatten so we have an overall sequence with subsequences which are in order.
    # Sorting is performed to make sure the test_created_at and test_created_at+1sec are in the correct order.
    # In particular, several entries are exactly 1 second apart;
    # we assume the later of these entries is "next" chronologically.
    dfs = list_concat(
        sorted(
            group_by(lambda df: df["test_created_at"][0], dfs).values(),
            key=lambda dfs: dfs[0]["test_created_at"][0],
        )
    )

    # group by the minute, order is retained within the group
    # giving us groups that put all entries of a given trial in the right order:
    # even if split by a single second, they are collected in the correct stimulus order
    # and their results are output within a minute of each other.
    # (The earlier revision also built two intermediate `grouped_dfs` groupings that were
    # never read; they are omitted here.)
    grouped = GroupedFrames.from_tuples(
        (df["test_created_at"][0].replace(second=0, microsecond=0), df) for df in dfs
    )

    def _correct_group(_, group_dfs):
        # `correct_realeye_df_group` replaces the list entries in place and returns nothing;
        # hand the same list back so `apply` has a value to store for the group.
        correct_realeye_df_group(group_dfs, time_col=dt_timestamp_col)
        return group_dfs

    # now we can apply the timestamp correction algorithm to the groups, functionally.
    grouped = grouped.apply(_correct_group)

    # lastly, wrap the corrected groups, now that their time columns are fixed
    mapped = GroupedFrames(grouped) #.concat_groups()

    return mapped


In [29]:

pipeline_raw_realeye_to_timed_dataframe(raw_gazes_csv)

Got a 7-tuple: sextuple = [857, 857, 4249, 0, 944, 580, 1]
Got a 7-tuple: sextuple = [1416, 209, 27024, 0, 1919, 1037, 1]
Got a 7-tuple: sextuple = [545, 1043, 3167, 0, 665, 932, 1]
Got a 7-tuple: sextuple = [237, 267, 40087, 0, 1172, 701, 1]
Got a 7-tuple: sextuple = [804, 475, 40287, 0, 1172, 701, 1]


GroupedFrames(num_groups=35, keys=[2024-10-23 18:37:00+00:00, 2024-10-23 20:12:00+00:00, 2024-10-25 20:53:00+00:00, 2024-10-30 16:37:00+00:00, 2024-11-01 20:31:00+00:00, ...])

In [None]:
raw_gazes_csv.write_json()

'[{"participant_id":"62e1d6db-e570-4194-b792-1fc4ef589aff","item_id":"5b8637a5-c1c7-47cc-a1b4-5abe24c5b5aa","test_created_at":"2025-02-26 20:12:00","test_raw_data":"[[973,475,28,0,1588,1079],[917,534,89,0,1588,1079],[921,520,103,0,1588,1079],[1021,484,130,0,1588,1079],[957,623,171,0,1588,1079],[971,612,210,0,1588,1079],[867,557,241,0,1588,1079],[1001,641,269,0,1588,1079],[888,537,303,0,1588,1079],[884,545,329,0,1588,1079],[859,530,369,0,1588,1079],[859,530,404,0,1588,1079],[1000,636,426,0,1588,1079],[1001,611,463,0,1588,1079],[1019,603,489,0,1588,1079],[837,540,524,0,1588,1079],[976,572,556,0,1588,1079],[963,603,587,0,1588,1079],[828,504,616,0,1588,1079],[998,564,654,0,1588,1079],[954,550,687,0,1588,1079],[953,600,715,0,1588,1079],[941,549,747,0,1588,1079],[955,577,783,0,1588,1079],[905,580,816,0,1588,1079],[898,569,854,0,1588,1079],[902,593,889,0,1588,1079],[917,546,920,0,1588,1079],[961,538,955,0,1588,1079],[988,576,983,0,1588,1079],[925,672,1018,0,1588,1079]]"},{"participant_id":"a1

# Fiddling with new things

## Redoing the pairing
A cleaner take on pairing the Tobii recordings with the RealEye groups.

In [17]:
# full Tobii pipeline is just a few lines of code
# TODO
from RevChem.tobii import (
    COLUMNS_TOBII,
    clean_tsv_file_name,
    filter_tobii_dfs_by_new_years_heuristics,
    read_tobii_individual_tsv,
)


def pipeline_tobii_directory_to_all_dfs(
    directory_of_individual_tobii_sessions: str,
    *,
    columns_subset: list[str] = COLUMNS_TOBII,
    column_renaming: dict[str, str] | None = None,
) -> list[pl.DataFrame]:
    """Read every entry of a Tobii session directory into a frame and filter the frames.

    Each entry is read with `read_tobii_individual_tsv`, reduced to `columns_subset`,
    renamed per `column_renaming`, and tagged with a `source_tsv` column derived from
    its file name. The frames are then passed through
    `filter_tobii_dfs_by_new_years_heuristics`.

    Args:
        directory_of_individual_tobii_sessions: Directory whose entries are the session files.
        columns_subset: Columns to keep from each file.
        column_renaming: Old-name to new-name mapping; `None` (the default) renames nothing.

    Returns:
        The frames that the heuristics filter returns.
    """
    # `None` sentinel instead of a shared mutable `{}` default.
    renaming = {} if column_renaming is None else column_renaming
    # `iterdir()` yields entries in arbitrary order; sorting makes the result
    # reproducible across machines and runs.
    session_files = sorted(Path(directory_of_individual_tobii_sessions).iterdir())
    all_tobii_dfs = [
        read_tobii_individual_tsv(tsv_file)[columns_subset]
        .rename(renaming)
        .with_columns(source_tsv=pl.lit(clean_tsv_file_name(tsv_file.name)))
        for tsv_file in session_files
    ]
    return filter_tobii_dfs_by_new_years_heuristics(all_tobii_dfs)

In [18]:
from RevChem.tobii import COLUMN_RENAMING_TOBII_TO_CSV

In [19]:


def test_tobii_pipeline():
    """Run the Tobii pipeline on TOBII_ROOT, then print the first returned frame and the frame count."""
    surviving_dfs = pipeline_tobii_directory_to_all_dfs(
        TOBII_ROOT,
        column_renaming=COLUMN_RENAMING_TOBII_TO_CSV,
    )
    example_df = surviving_dfs[0]
    print("Example of passing data frame", example_df)
    n_surviving = len(surviving_dfs)
    print(f"Number of dataframes post-pipeline and filter = {n_surviving}")

test_tobii_pipeline()

1 dfs failed the 0-th criterion "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ"
The following previews show the dfs that missed the criteria: len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ
shape: (2, 5)
┌────────────────────────────────┬──────┬──────┬──────────────────┬───────┐
│ timestamp                      ┆ X    ┆ Y    ┆ source_tsv       ┆ count │
│ ---                            ┆ ---  ┆ ---  ┆ ---              ┆ ---   │
│ datetime[μs, UTC]              ┆ i32  ┆ i32  ┆ str              ┆ i32   │
╞════════════════════════════════╪══════╪══════╪══════════════════╪═══════╡
│ 2025-02-27 23:36:21.203 UTC    ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
│ 2025-02-27 23:36:21.325819 UTC ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
└────────────────────────────────┴──────┴──────┴──────────────────┴───────┘
Example of passing data frame shape: (55_856, 4)
┌────────────────────────────────┬──────┬──────┬──────────────────────┐
│ timestamp              

In [None]:
# | export
import json
import gzip
from typing import Any, List, Tuple


def prepare_chunk_for_json(
    df_120hz: pl.DataFrame, df_30hz_list: List[pl.DataFrame], chunk_id: datetime | str
) -> dict:
    """
    Bundle a 120 Hz frame and its 30 Hz fragments into one JSON-serializable record.

    Args:
        df_120hz: Polars DataFrame with 120 Hz data
        df_30hz_list: List of Polars DataFrames with 30 Hz fragments
        chunk_id: Identifier of the chunk; a string is used as-is, anything else
            is converted with its `isoformat()` method.

    Returns:
        Dictionary with keys "chunk_id", "hz_120" and "hz_30_fragments". The frames
        are stored as the JSON strings produced by `DataFrame.write_json()`.
    """
    # Serialize the frames first: the 120 Hz frame, then each 30 Hz fragment in order.
    serialized_120hz = df_120hz.write_json()
    serialized_fragments = []
    for fragment_df in df_30hz_list:
        serialized_fragments.append(fragment_df.write_json())

    if isinstance(chunk_id, str):
        chunk_label = chunk_id
    else:
        chunk_label = chunk_id.isoformat()

    return {
        "chunk_id": chunk_label,
        "hz_120": serialized_120hz,
        "hz_30_fragments": serialized_fragments,
    }


def write_chunks_to_json(
    chunk_associations: List[Tuple[pl.DataFrame, List[pl.DataFrame]]],
    output_path: str,
    *,
    key_func: Callable[[pl.DataFrame, list[pl.DataFrame]], Any],
) -> None:
    """
    Serialize every (120 Hz frame, 30 Hz fragments) pair and write them all to one gzip-compressed JSON file.

    Args:
        chunk_associations: List of tuples, each containing a 120 Hz DataFrame and a list of 30 Hz DataFrames
        output_path: Path to write the compressed JSON file (should end in .json.gz)
        key_func: Called with the 120 Hz frame and its fragment list; its result becomes the chunk's id.
    """
    # One record per pair, in input order.
    all_chunks = []
    for df_120hz, df_30hz_list in chunk_associations:
        chunk_key = key_func(df_120hz, df_30hz_list)
        all_chunks.append(prepare_chunk_for_json(df_120hz, df_30hz_list, chunk_key))

    # The whole document is built in memory, then written in one go as UTF-8 text.
    json_data = json.dumps(all_chunks, ensure_ascii=False)
    with gzip.open(output_path, mode="wt", encoding="utf-8") as gz_file:
        gz_file.write(json_data)


"""
Tobii dataframe
│ timestamp                      ┆ X    ┆ Y    ┆ source_tsv   │
│ ---                            ┆ ---  ┆ ---  ┆ ---          │
│ datetime[μs, UTC]              ┆ i32  ┆ i32  ┆ str          │


RealEye dataframe
│ test_created_at         ┆ timestamp                   ┆ X    ┆ Y   │
│ ---                     ┆ ---                         ┆ ---  ┆ --- │
│ datetime[μs, UTC]       ┆ datetime[μs, UTC]           ┆ i32  ┆ i32 │
"""

# Column dtypes used by default as `schema_overrides` when the 120 Hz (Tobii) frames are read back from JSON.
# NOTE(review): `pl.Datetime` is given without a time zone, whereas the preview above shows UTC columns —
# confirm that the zone survives the JSON round trip.
TOBII_POLARS_SCHEMA = {
    "timestamp": pl.Datetime,
    "X": pl.Int32,
    "Y": pl.Int32,
    "source_tsv": pl.String,
}
# Column dtypes used by default as `schema_overrides` when the 30 Hz (RealEye) fragments are read back from JSON.
REALEYE_POLARS_SCHEMA = {
    "test_created_at": pl.Datetime,
    "timestamp": pl.Datetime,
    "X": pl.Int32,
    "Y": pl.Int32,
}


def read_chunks_from_json(
    input_path: str,
    tobii_schema=TOBII_POLARS_SCHEMA,
    realeye_schema=REALEYE_POLARS_SCHEMA,
) -> list[tuple[pl.DataFrame, list[pl.DataFrame]]]:
    """
    Load a gzip-compressed JSON file and rebuild the 120 Hz and 30 Hz DataFrames it stores.

    Args:
        input_path: Path to the compressed JSON file (e.g., output.json.gz)
        tobii_schema: dtypes passed as `schema_overrides` for each 120 Hz frame
        realeye_schema: dtypes passed as `schema_overrides` for each 30 Hz fragment

    Returns:
        list of tuples, each containing a 120 Hz DataFrame and a list of 30 Hz DataFrames
    """
    with gzip.open(input_path, "rt", encoding="utf-8") as gz_file:
        stored_chunks = json.load(gz_file)

    def _frame_from_json(json_text, schema):
        # Each frame is stored as a JSON string; polars reads it back from its UTF-8 bytes.
        return pl.read_json(bytes(json_text, encoding="UTF-8"), schema_overrides=schema)

    return [
        (
            _frame_from_json(chunk["hz_120"], tobii_schema),
            [
                _frame_from_json(fragment, realeye_schema)
                for fragment in chunk["hz_30_fragments"]
            ],
        )
        for chunk in stored_chunks
    ]


def write_chunks_to_parquet(
    chunk_associations: list[tuple[pl.DataFrame, list[pl.DataFrame]]], output_dir: str
) -> None:
    """
    Store every 120 Hz frame and each of its 30 Hz fragments as individual Parquet files.

    Files are named `chunk_<i>_120hz.parquet` and `chunk_<i>_30hz_<j>.parquet`, where
    `<i>` is the position of the pair and `<j>` the position of the fragment.

    Args:
        chunk_associations: list of tuples, each containing a 120 Hz DataFrame and a list of 30 Hz DataFrames
        output_dir: Directory to write Parquet files (created, with parents, if missing)
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for chunk_no, pair in enumerate(chunk_associations):
        frame_120hz, fragments_30hz = pair
        frame_120hz.write_parquet(target_dir / f"chunk_{chunk_no}_120hz.parquet")
        for fragment_no, fragment in enumerate(fragments_30hz):
            fragment_file = target_dir / f"chunk_{chunk_no}_30hz_{fragment_no}.parquet"
            fragment.write_parquet(fragment_file)


def read_chunks_from_parquet(
    input_dir: str, num_chunks: int
) -> list[tuple[pl.DataFrame, list[pl.DataFrame]]]:
    """
    Read 120 Hz chunks and their 30 Hz fragments from Parquet files.

    Expects the `chunk_<i>_120hz.parquet` / `chunk_<i>_30hz_<j>.parquet` naming.
    For each chunk, fragments are read for `j = 0, 1, 2, ...` until the first
    missing file, so any number of consecutive fragments is returned.

    Args:
        input_dir: Directory containing Parquet files
        num_chunks: Number of chunks to read (to know how many files to look for)

    Returns:
        list of tuples, each containing a 120 Hz DataFrame and a list of 30 Hz DataFrames
    """
    input_path = Path(input_dir)
    chunk_associations = []
    for idx in range(num_chunks):
        # Read 120 Hz DataFrame
        df_120hz = pl.read_parquet(input_path / f"chunk_{idx}_120hz.parquet")

        # Read 30 Hz fragments until the numbering stops. There is no fixed upper bound,
        # because the writer emits one file per fragment however many there are.
        df_30hz_list = []
        frag_idx = 0
        while (
            fragment_path := input_path / f"chunk_{idx}_30hz_{frag_idx}.parquet"
        ).exists():
            df_30hz_list.append(pl.read_parquet(fragment_path))
            frag_idx += 1

        chunk_associations.append((df_120hz, df_30hz_list))

    return chunk_associations

In [36]:
raw_gazes_csv

participant_id,item_id,test_created_at,test_raw_data
str,str,datetime[μs],str
"""62e1d6db-e570-4194-b792-1fc4ef…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 20:12:00,"""[[973,475,28,0,1588,1079],[917…"
"""a19074f0-818d-42d2-8d29-290429…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 21:38:52,"""[[902,599,36,0,1484,1077],[938…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 18:37:18,"""[[1085,721,36,0,1919,593],[102…"
"""105a3ff2-f0de-4d4b-adaa-feddee…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 20:12:28,"""[[1054,543,51,0,897,551],[1052…"
"""7fdc2add-bd15-4a04-b77b-242e5b…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-25 20:53:25,"""[[938,648,44,0,1919,400],[982,…"
…,…,…,…
"""d1849f40-e6bc-47fd-b165-c4f545…","""b1641c81-54ff-4370-ab32-8476ef…",2025-05-05 20:21:41,"""[[956,784,33,0,980,583],[941,7…"
"""3510828a-4dcc-4ec4-8145-03d89b…","""b1641c81-54ff-4370-ab32-8476ef…",2025-05-06 17:16:12,"""[[1133,436,32,0,1919,0],[723,6…"
"""ebc78ee8-06ce-45cf-be0e-a8fa21…","""b1641c81-54ff-4370-ab32-8476ef…",2025-05-07 15:10:16,"""[[1021,565,33,0,1919,381],[105…"
"""a3208581-49fe-40fd-a2b3-8272e8…","""b1641c81-54ff-4370-ab32-8476ef…",2025-05-08 15:28:58,"""[[1150,487,26,0,949,576],[1166…"


In [None]:
#| export
def match_tobii_to_realeye_groups(
    tobii_dfs: "List[pl.DataFrame]",
    realeye_groups: "GroupedFrames",
    *,
    tobii_df_time_key: str = "timestamp",  # Key in the Tobii DataFrame for matching,
    # realeye_df_time_key: str = "test_created_at"  # Key in the RealEye DataFrame for matching
):
    """Greedily pair each Tobii frame with the closest RealEye group that starts after it.

    Tobii frames are visited in list order. Each takes the not-yet-claimed RealEye group
    whose key is the smallest amount of time (at least 50 seconds, under 1000 days) after
    the frame's first `tobii_df_time_key` value.

    A Tobii frame with no eligible group is left out of the result and reported with a
    warning. Previously its "not found" index of -1 silently selected the *last* group.

    Returns:
        A list of `(tobii_df sorted by tobii_df_time_key, realeye group)` tuples, one per
        matched Tobii frame, in the order of `tobii_dfs`.
    """
    import warnings

    times_tobii = [df[tobii_df_time_key][0] for df in tobii_dfs]
    times_realeye = list(realeye_groups.keys())
    if times_realeye:  # nothing to check (and nothing to match) when there are no groups
        assert type(times_realeye[0]) is datetime, "RealEye group keys should be datetimes"
    _logging = False  # set to True to see the matching process
    if _logging:
        print(f"Matching {len(times_tobii)} Tobii timestamps to {len(times_realeye)} RealEye timestamps")

    # tobii is the reference, realeye is under scrutiny
    # 50 sec is the shortest time between Tobii "Start record", skipping validation, and starting RealEye
    _MIN_TIME_DELTA = timedelta(seconds=50)
    found_indices: set[int] = set()  # RealEye groups already claimed by an earlier Tobii frame
    result = []
    for tobii_df, tobii_time in zip(tobii_dfs, times_tobii):
        current_min_time_diff = timedelta(days=1_000)  # more time than is sensible
        found_index = None
        for i, re_time in enumerate(times_realeye):
            if i in found_indices:
                continue
            latest_diff = re_time - tobii_time
            if _MIN_TIME_DELTA <= latest_diff < current_min_time_diff:
                found_index = i
                current_min_time_diff = latest_diff
                if _logging:
                    print(f"Changed: {current_min_time_diff = }")

        if found_index is None:
            # Do not fall through with a sentinel index: -1 would address the last group.
            warnings.warn(
                f"No RealEye group starts at least {_MIN_TIME_DELTA} after the Tobii "
                f"recording beginning at {tobii_time}; leaving it unmatched."
            )
            continue

        found_indices.add(found_index)
        result.append(
            (tobii_df.sort(tobii_df_time_key), realeye_groups[times_realeye[found_index]])
        )
    return result


In [43]:
from RevChem.common import date_str_now, datetime_to_stamp, first

EXPORT_ROOT = Path(DATA_ROOT, "..", f"{date_str_now()}-python-outputs").resolve()


# new way
def run_data_export_NEW():
    """Run the whole export in one place: RealEye pipeline, Tobii pipeline, matching, gzip-JSON output.

    Progress and one sample per stage are printed along the way. The output file is
    written into EXPORT_ROOT and named after the timestamp taken when the run starts.
    """
    run_start_timestamp_str = datetime_to_stamp()

    print("RE processing...")
    realeye_groups = pipeline_raw_realeye_to_timed_dataframe(
        raw_gazes_csv, dt_timestamp_col="timestamp"
    )
    print("RE done. Sample:")
    first_group_key = first(realeye_groups.keys())
    print(realeye_groups[first_group_key])

    print("\n\nTobii processing...")
    tobii_dfs = pipeline_tobii_directory_to_all_dfs(
        TOBII_ROOT, column_renaming=COLUMN_RENAMING_TOBII_TO_CSV
    )
    print("Tobii done. Sample:")
    print(tobii_dfs[0].head(2))

    print("\n\nMatching...")
    matches = match_tobii_to_realeye_groups(tobii_dfs, realeye_groups)
    print("Matching complete. Sample:")
    print(matches[0])

    print("\n\nExporting...")
    EXPORT_ROOT.mkdir(parents=True, exist_ok=True)
    output_file = EXPORT_ROOT / f"{run_start_timestamp_str}-matches-with-TCA.json.gz"

    def _minute_of_first_fragment(_, dfs):
        # Chunk id: creation time of the first RealEye fragment, truncated to the minute.
        created_at = dfs[0]["test_created_at"][0]
        return created_at.replace(second=0, microsecond=0)

    write_chunks_to_json(matches, output_file, key_func=_minute_of_first_fragment)
    print(f"Data written as JSON to path: {output_file}")
    print("Export complete. Thank you for playing.")


run_data_export_NEW()

RE processing...
Got a 7-tuple: sextuple = [857, 857, 4249, 0, 944, 580, 1]
Got a 7-tuple: sextuple = [1416, 209, 27024, 0, 1919, 1037, 1]
Got a 7-tuple: sextuple = [545, 1043, 3167, 0, 665, 932, 1]
Got a 7-tuple: sextuple = [237, 267, 40087, 0, 1172, 701, 1]
Got a 7-tuple: sextuple = [804, 475, 40287, 0, 1172, 701, 1]
RE done. Sample:
[shape: (32, 4)
┌─────────────────────────┬─────────────────────────────┬──────┬─────┐
│ test_created_at         ┆ timestamp                   ┆ X    ┆ Y   │
│ ---                     ┆ ---                         ┆ ---  ┆ --- │
│ datetime[μs, UTC]       ┆ datetime[μs, UTC]           ┆ i32  ┆ i32 │
╞═════════════════════════╪═════════════════════════════╪══════╪═════╡
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.015 UTC ┆ 1085 ┆ 721 │
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.042 UTC ┆ 1027 ┆ 765 │
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.075 UTC ┆ 986  ┆ 776 │
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.106 UTC ┆ 1284 ┆ 729 │
│ 2024-