# Tobii and RealEye record pairing
This notebook is to capture the work of pairing Tobii TSV files with RealEye long records.

In [1]:
#| default_exp tobii

In [2]:
#| export
import polars as pl
# from pathlib import Path

In [3]:

DATA_ROOT = "../../RevChemData/2025-05-14-Data_Export"
!ls {DATA_ROOT}

README.txt         [30m[43mRealEye[m[m            [30m[43mTobii-All-Snapshot[m[m


In [4]:
TOBII_ROOT = f"{DATA_ROOT}/Tobii-All-Snapshot"
REALEYE_ROOT = f"{DATA_ROOT}/RealEye"

In [5]:
!ls {TOBII_ROOT}

[31m1.Realeye1,2,3 2025-02-19 Pikachu.tsv[m[m
[31m1.Realeye1,2,3 2025-02-19_Ailleen_RE.tsv[m[m
[31m1.Realeye1,2,3 2025-02-20_Hannah_tobii1.tsv[m[m
[31m1.Realeye1,2,3 2025-02-20_PinterestFiend.tsv[m[m
[31m1.Realeye1,2,3 2025-02-21 Charmander.tsv[m[m
[31m1.Realeye1,2,3 2025-02-21_Ponyta.tsv[m[m
[31m1.Realeye1,2,3 2025-02-26_Elinor.tsv[m[m
[31m1.Realeye1,2,3 2025-02-26_Raichu.tsv[m[m
[31m1.Realeye1,2,3 2025-02-28_Flute.tsv[m[m
[31m1.Realeye1,2,3 2025-03-03-H.tsv[m[m
[31m1.Realeye1,2,3 2025-03-05_Blastoise.tsv[m[m
[31m1.Realeye1,2,3 2025-03-05_Luna.tsv[m[m
[31m1.Realeye1,2,3 2025-03-06_Ozempic.tsv[m[m
[31m1.Realeye1,2,3 2025-03-07-Celebi.tsv[m[m
[31m1.Realeye1,2,3 2025-03-07-Cyndaquil.tsv[m[m
[31m1.Realeye1,2,3 2025-03-10-Dakrai.tsv[m[m
[31m1.Realeye1,2,3 2025-03-11_Ninetails.tsv[m[m
[31m1.Realeye1,2,3 2025-04-24_Totadile.tsv[m[m
[31m1.Realeye1,2,3 2025-05-05_Butterfree.tsv[m[m
[31m1.Realeye1,2,3 2025-05-05_Stephen-Kathy-Control.tsv

In [6]:
!wc -l {REALEYE_ROOT}/raw-gazes.csv

     351 ../../RevChemData/2025-05-14-Data_Export/RealEye/raw-gazes.csv


## Game plan
I want the timestamps of every record in the raw gazes
- Each row should have this time stamp, trivially.
    - The first row (of a given file) definitely will have it
- If necessary, I'll have to add some milliseconds to the first record and increment to get the start times of each segment of stimulus

I'm going to compare them to the start times of each of the Tobii records, which I can access from the first line of each of those records
- Tobii explicitly records the start time of the recording and milliseconds since the start of the recording


In [7]:
raw_gazes_csv = pl.read_csv(
    f"{REALEYE_ROOT}/raw-gazes.csv",
    columns=["participant_id", "item_id", "test_created_at", "test_raw_data"],
    schema_overrides={"test_created_at": pl.Datetime},
)

In [8]:
# | export

# the old code
# item_ids_in_order = raw_gazes_csv.unique("item_id", maintain_order=True)["item_id"]
REALEYE_ITEM_IDS = [
    "5b8637a5-c1c7-47cc-a1b4-5abe24c5b5aa",
    "83e23c46-4fe9-483b-abb5-4ba3cce6fd9e",
    "f456cacf-9abd-4825-a964-859f1b986c81",
    "d4bdad0c-13ae-4276-b1f4-007c2ad7a495",
    "e9978edb-9cdf-4db7-acef-3a08dfec2991",
    "75ee4147-26ff-48ff-8c88-1743c33b86c9",
    "63dc6493-df28-4aba-9d79-1dedc37d3bca",
    "efff95c6-ae5c-482a-9388-dafb0050dc3b",
    "e5762660-01a6-4e77-8e04-066b96d066c1",
    "b1641c81-54ff-4370-ab32-8476efce616d",
]

In [9]:
# TODO clean further
# only used in this notebook, and for the historical use of the names
item_ids_in_order = item_ids_in_order_LIST = REALEYE_ITEM_IDS
display(item_ids_in_order)

['5b8637a5-c1c7-47cc-a1b4-5abe24c5b5aa',
 '83e23c46-4fe9-483b-abb5-4ba3cce6fd9e',
 'f456cacf-9abd-4825-a964-859f1b986c81',
 'd4bdad0c-13ae-4276-b1f4-007c2ad7a495',
 'e9978edb-9cdf-4db7-acef-3a08dfec2991',
 '75ee4147-26ff-48ff-8c88-1743c33b86c9',
 '63dc6493-df28-4aba-9d79-1dedc37d3bca',
 'efff95c6-ae5c-482a-9388-dafb0050dc3b',
 'e5762660-01a6-4e77-8e04-066b96d066c1',
 'b1641c81-54ff-4370-ab32-8476efce616d']

In [10]:
# Don't bother with `test_id`. It's unique to the participant_id-item_id pair.
# See the following queries for proof
#
# raw_gazes_csv["participant_id"].unique_counts()
# raw_gazes_csv["item_id"].unique_counts()
# raw_gazes_csv["test_id"].unique_counts()

In [11]:
realeye_raw_sorted = raw_gazes_csv.sort(by="test_created_at")
display(realeye_raw_sorted["participant_id"][0])  # first ID, should match Pikachu
display(realeye_raw_sorted.columns)

'5f08f1c2-e9f3-4adb-9fb4-a68b3a1e12a9'

['participant_id', 'item_id', 'test_created_at', 'test_raw_data']

### Utility: Group-by

In [12]:
#| export
from RevChem.common import group_by

## Note - Timing
RealEye uses the UTC timestamp. I don't see this on their documentation, but it appears to agree with the Tobii recording.

In [13]:
realeye_raw_sorted

participant_id,item_id,test_created_at,test_raw_data
str,str,datetime[μs],str
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 18:37:18,"""[[1085,721,36,0,1919,593],[102…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""83e23c46-4fe9-483b-abb5-4ba3cc…",2024-10-23 18:37:18,"""[[911,667,51,0,1919,593],[956,…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""f456cacf-9abd-4825-a964-859f1b…",2024-10-23 18:37:18,"""[[1338,648,44,0,1919,593],[131…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""d4bdad0c-13ae-4276-b1f4-007c2a…",2024-10-23 18:37:18,"""[[1031,732,39,0,1919,593],[140…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""e9978edb-9cdf-4db7-acef-3a08df…",2024-10-23 18:37:18,"""[[1025,642,36,0,1919,593],[106…"
…,…,…,…
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""75ee4147-26ff-48ff-8c88-1743c3…",2025-05-14 20:10:48,"""[[949,747,40,0,1919,608],[875,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""63dc6493-df28-4aba-9d79-1dedc3…",2025-05-14 20:10:48,"""[[950,749,25,0,1919,608],[982,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""efff95c6-ae5c-482a-9388-dafb00…",2025-05-14 20:10:48,"""[[431,502,35,0,1919,608],[499,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""e5762660-01a6-4e77-8e04-066b96…",2025-05-14 20:10:48,"""[[1914,505,45,0,1919,608],[166…"


### Make sure that the RealEye groups are concatenated consistently

Ordering should be retained in the group-by. This is just double-checking that.

In [14]:
def first(tup: tuple): return tup[0]
def second(tup: tuple): return tup[1]

In [15]:
# checking that the order is retained of the items
list(
    map(
        second, # second item is item_id
        group_by(lambda df_row: df_row[0], # [0] -> participant_id
         realeye_raw_sorted.iter_rows())[ # key into the a particular participant_id / trial
            "b9682fc8-11b0-4cac-87f9-ab9a98164ecf"
        ],
    )
) == item_ids_in_order_LIST

True

In [16]:
#| export
def tobii_timestamp_to_datetime(
    datetime_col: str = "Recording start datetime UTC", # column with the recording start datetime, UTC or TZ-specific.
    timestamp_col: str = "Recording timestamp", # the integer column representing the microseconds since the recording started
    *,
    overwrite: bool = True, # Whether, the `timestamp_col` will have a datetime type in the result, or (if False) a new column is created
) -> pl.DataFrame:
    """Update `timestamp_col` to be an increasing datetime rather than the default (i64 or int).

    Corrects the `timestamp_col` of `df` to be a `pl.Datetime`, to ease legibility and computation.
    Sums the `timestamp_col` with that of the reference `datetime_col`, incrementing the time forward.

    Returns:
        A dataframe with the described change to the `timestamp_col`
    """
    new_name = timestamp_col if overwrite else f"{timestamp_col}__new_dt_time"
    new_column = pl.col(datetime_col) + pl.duration(microseconds=timestamp_col)
    new_column = new_column.alias(new_name)

    return new_column

In [17]:
# | export
def read_tobii_individual_tsv(
    path_to_tsv: str,
    schema_overrides: dict[str, pl.DataType] = {},
    *,
    change_timestamp_to_datetime: bool = True,  # whether to use this record's derived datetime to calculate per-observation datetimestamps in "Recording timestamp"
) -> pl.DataFrame:
    "Reads a trial-exported Tobii TSV, with the minimal columns that we need"
    individual_tsv = pl.read_csv(
        path_to_tsv,
        separator="\t",
        infer_schema=True,
        schema_overrides={
            "Recording start time UTC": pl.Time,
            "Recording start time": pl.Time,
            "Gaze point X": pl.Int32,
            "Gaze point Y": pl.Int32,
        }
        | schema_overrides,
    )
    start_date_utc = pl.col("Recording date UTC").str.to_date("%m/%d/%Y")
    start_time_utc = pl.col("Recording start time UTC")

    start_dt_utc = (
        start_date_utc.dt.combine(
            pl.time(
                start_time_utc.dt.hour(),
                start_time_utc.dt.minute(),
                start_time_utc.dt.second(),
                start_time_utc.dt.microsecond(),
            )
        )
        .dt.replace_time_zone(time_zone="UTC")
        .alias("start_dt_utc")
    )
    start_time_ = pl.col("Recording start time")

    start_date_ = (
        pl.col("Recording date").str.to_date("%m/%d/%Y")
        if "Recording date" in individual_tsv.columns
        else (
            start_dt_utc.dt.convert_time_zone(time_zone="America/New_York")
            .dt.date()
            .alias("Recording date")
        )
    )
    start_dt_tz = (
        start_date_.dt.combine(
            pl.time(
                start_time_.dt.hour(),
                start_time_.dt.minute(),
                start_time_.dt.second(),
                start_time_.dt.microsecond(),
            )
        )
        .dt.replace_time_zone(time_zone="America/New_York")
        .alias("start_dt_tz")
    )

    result = individual_tsv.with_columns(
        start_date_utc,
        start_date_,
        start_dt_utc,
        start_dt_tz,
    )
    if change_timestamp_to_datetime:
        result = result.with_columns(  # separate call so the columns that we fiddle are guaranteed present
            tobii_timestamp_to_datetime(datetime_col="start_dt_utc")
        )
    # display(plan) when using a LazyFrame to get debugging support
    return result

In [18]:
participant_tsv_pikachu = read_tobii_individual_tsv(
    TOBII_ROOT + "/1.Realeye1,2,3 2025-02-19 Pikachu.tsv",
)

In [67]:
participant_tsv_pikachu

Recording timestamp,Computer timestamp,Sensor,Project name,Export date,Participant name,trials,Recording name,Recording date,Recording date UTC,Recording start time,Recording start time UTC,Recording duration,Timeline name,Recording Fixation filter name,Recording software version,Recording resolution height,Recording resolution width,Recording monitor latency,Average calibration accuracy (mm),Average calibration precision SD (mm),Average calibration precision RMS (mm),Average calibration accuracy (degrees),Average calibration precision SD (degrees),Average calibration precision RMS (degrees),Average calibration accuracy (pixels),Average calibration precision SD (pixels),Average calibration precision RMS (pixels),Average validation accuracy (mm),Average validation precision SD (mm),Average validation precision RMS (mm),Average validation accuracy (degrees),Average validation precision SD (degrees),Average validation precision RMS (degrees),Average validation accuracy (pixels),Average validation precision SD (pixels),Average validation precision RMS (pixels),…,AOI hit [Screen Recording (1) Recording12 - shell dissolved],AOI hit [Screen Recording (1) Recording8 - shell mid],AOI hit [Screen Recording (1) Recording9 - shell mid],AOI hit [Screen Recording (1) Recording10 - shell mid],AOI hit [Screen Recording (1) Recording12 - shell mid],AOI hit [Screen Recording (1) Recording8 - shell whole],AOI hit [Screen Recording (1) Recording9 - shell whole],AOI hit [Screen Recording (1) Recording10 - shell whole],AOI hit [Screen Recording (1) Recording12 - shell whole],AOI hit [ocean 1 - small shell],AOI hit [Screen Recording (1) Recording8 - statement],AOI hit [Screen Recording (1) Recording9 - statement],AOI hit [Screen Recording (1) Recording10 - statement],AOI hit [Screen Recording (1) Recording12 - statement],AOI hit [Screen Recording (1) Recording8 - top],AOI hit [Screen Recording (1) Recording9 - top],AOI hit [Screen Recording (1) Recording10 - top],AOI hit [Screen Recording (1) Recording12 - top],AOI hit [ocean 1 - top],AOI hit [Screen Recording (1) Recording8 - triangle],AOI hit [Screen Recording (1) Recording9 - triangle],AOI hit [Screen Recording (1) Recording10 - triangle],AOI hit [Screen Recording (1) Recording12 - triangle],AOI hit [triangle two - triangle],AOI hit [ocean 1 - whole shell],Client area position X (DACSpx),Client area position Y (DACSpx),Viewport position X,Viewport position Y,Viewport width,Viewport height,Full page width,Full page height,Mouse position X,Mouse position Y,start_dt_utc,start_dt_tz
"datetime[μs, UTC]",i64,str,str,str,str,str,str,date,date,time,time,i64,str,str,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,i64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,"datetime[μs, UTC]","datetime[μs, America/New_York]"
2025-02-19 20:06:09.486 UTC,1654288941931,,"""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:06:09.549181 UTC,1654289005112,"""Mouse""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,566,945,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:06:09.587351 UTC,1654289043282,"""Mouse""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,566,945,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:06:09.607207 UTC,1654289063138,"""Mouse""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,566,944,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:06:09.627150 UTC,1654289083081,"""Mouse""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,566,944,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-02-19 20:17:20.959543 UTC,1654960415474,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:17:20.967876 UTC,1654960423807,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:17:20.976209 UTC,1654960432140,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST
2025-02-19 20:17:20.980292 UTC,1654960436223,"""Mouse""","""1.Realeye1,2,3""","""05/14/2025""","""Pikachu""",,"""2025-02-19 Pikachu""",2025-02-19,2025-02-19,15:06:09.486,20:06:09.486,671661,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,3.0,10.3,13.8,0.26,0.92,1.22,10,33,44,6.7,3.5,2.5,0.63,0.33,0.23,21,11,8,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1873,992,2025-02-19 20:06:09.486 UTC,2025-02-19 15:06:09.486 EST


In [69]:
participant_tsv_ninetails = read_tobii_individual_tsv(TOBII_ROOT+ "/1.Realeye1,2,3 2025-03-11_Ninetails.tsv")
participant_tsv_ninetails.shape

(61617, 165)

In [70]:
participant_tsv_ninetails

Recording timestamp,Computer timestamp,Sensor,Project name,Export date,Participant name,trials,Recording name,Recording date,Recording date UTC,Recording start time,Recording start time UTC,Recording duration,Timeline name,Recording Fixation filter name,Recording software version,Recording resolution height,Recording resolution width,Recording monitor latency,Average calibration accuracy (mm),Average calibration precision SD (mm),Average calibration precision RMS (mm),Average calibration accuracy (degrees),Average calibration precision SD (degrees),Average calibration precision RMS (degrees),Average calibration accuracy (pixels),Average calibration precision SD (pixels),Average calibration precision RMS (pixels),Average validation accuracy (mm),Average validation precision SD (mm),Average validation precision RMS (mm),Average validation accuracy (degrees),Average validation precision SD (degrees),Average validation precision RMS (degrees),Average validation accuracy (pixels),Average validation precision SD (pixels),Average validation precision RMS (pixels),…,AOI hit [Screen Recording (1) Recording12 - shell dissolved],AOI hit [Screen Recording (1) Recording8 - shell mid],AOI hit [Screen Recording (1) Recording9 - shell mid],AOI hit [Screen Recording (1) Recording10 - shell mid],AOI hit [Screen Recording (1) Recording12 - shell mid],AOI hit [Screen Recording (1) Recording8 - shell whole],AOI hit [Screen Recording (1) Recording9 - shell whole],AOI hit [Screen Recording (1) Recording10 - shell whole],AOI hit [Screen Recording (1) Recording12 - shell whole],AOI hit [ocean 1 - small shell],AOI hit [Screen Recording (1) Recording8 - statement],AOI hit [Screen Recording (1) Recording9 - statement],AOI hit [Screen Recording (1) Recording10 - statement],AOI hit [Screen Recording (1) Recording12 - statement],AOI hit [Screen Recording (1) Recording8 - top],AOI hit [Screen Recording (1) Recording9 - top],AOI hit [Screen Recording (1) Recording10 - top],AOI hit [Screen Recording (1) Recording12 - top],AOI hit [ocean 1 - top],AOI hit [Screen Recording (1) Recording8 - triangle],AOI hit [Screen Recording (1) Recording9 - triangle],AOI hit [Screen Recording (1) Recording10 - triangle],AOI hit [Screen Recording (1) Recording12 - triangle],AOI hit [triangle two - triangle],AOI hit [ocean 1 - whole shell],Client area position X (DACSpx),Client area position Y (DACSpx),Viewport position X,Viewport position Y,Viewport width,Viewport height,Full page width,Full page height,Mouse position X,Mouse position Y,start_dt_utc,start_dt_tz
"datetime[μs, UTC]",i64,str,str,str,str,str,str,date,date,time,time,i64,str,str,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,i64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]","datetime[μs, America/New_York]"
2025-03-11 21:20:24.432 UTC,1040774603140,,"""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:20:24.557179 UTC,1040774728319,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:20:24.565513 UTC,1040774736653,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:20:24.573848 UTC,1040774744988,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:20:24.582182 UTC,1040774753322,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-03-11 21:28:49.554316 UTC,1041279725456,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:28:49.562650 UTC,1041279733790,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:28:49.570983 UTC,1041279742123,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT
2025-03-11 21:28:49.579316 UTC,1041279750456,"""Eye Tracker""","""1.Realeye1,2,3""","""05/14/2025""","""Ninetails""",,"""2025-03-11_Ninetails""",2025-03-11,2025-03-11,17:20:24.432,21:20:24.432,505318,"""Timeline1""","""Tobii I-VT (Fixation)""","""1.241.54219""",1080,1920,10.0,1.3,5.6,7.4,0.09,0.41,0.54,4,18,24,6.5,10.7,8.3,0.49,0.8,0.63,21,34,27,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2025-03-11 21:20:24.432 UTC,2025-03-11 17:20:24.432 EDT


## Preparing a clean CSV for each of the Tobii runs

In [20]:
participant_tsv_ninetails[["Recording timestamp", "Recording start time UTC"]]

Recording timestamp,Recording start time UTC
"datetime[μs, UTC]",time
2080-05-20 18:40:48.864 UTC,21:20:24.432
2080-05-20 18:40:48.989179 UTC,21:20:24.432
2080-05-20 18:40:48.997513 UTC,21:20:24.432
2080-05-20 18:40:49.005848 UTC,21:20:24.432
2080-05-20 18:40:49.014182 UTC,21:20:24.432
…,…
2080-05-20 18:49:13.986316 UTC,21:20:24.432
2080-05-20 18:49:13.994650 UTC,21:20:24.432
2080-05-20 18:49:14.002983 UTC,21:20:24.432
2080-05-20 18:49:14.011316 UTC,21:20:24.432


In [21]:
[col for col in participant_tsv_ninetails.columns ]

['Recording timestamp',
 'Computer timestamp',
 'Sensor',
 'Project name',
 'Export date',
 'Participant name',
 'trials',
 'Recording name',
 'Recording date',
 'Recording date UTC',
 'Recording start time',
 'Recording start time UTC',
 'Recording duration',
 'Timeline name',
 'Recording Fixation filter name',
 'Recording software version',
 'Recording resolution height',
 'Recording resolution width',
 'Recording monitor latency',
 'Average calibration accuracy (mm)',
 'Average calibration precision SD (mm)',
 'Average calibration precision RMS (mm)',
 'Average calibration accuracy (degrees)',
 'Average calibration precision SD (degrees)',
 'Average calibration precision RMS (degrees)',
 'Average calibration accuracy (pixels)',
 'Average calibration precision SD (pixels)',
 'Average calibration precision RMS (pixels)',
 'Average validation accuracy (mm)',
 'Average validation precision SD (mm)',
 'Average validation precision RMS (mm)',
 'Average validation accuracy (degrees)',
 'Av

In [22]:
#| export
COLUMNS_TOBII = [
    "Recording timestamp",
    "Gaze point X",
    "Gaze point Y",
]

In [23]:
#| export
from pathlib import Path

#### Coordinate nullity analysis
Looking at the coordinates captured across X- and Y- to assess the damage and reach understanding

In [24]:
participant_tsv_pikachu.select(
    pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="Gaze point X").select(
    pl.col("count").alias("Metacount").value_counts()
).unnest(
    "Metacount"
)

Metacount,count
u32,u32
60,1
36,11
50,5
19,55
51,1
…,…
37,10
79,2
48737,1
73,1


In [25]:
# How often did we look at a given x coordinate
participant_tsv_pikachu.select(
    pl.col("Gaze point X").cast(pl.Int64).value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="count")

Gaze point X,count
i64,u32
324,1
-854,1
1968,1
2145,1
2107,1
…,…
1339,101
982,102
978,103
1341,104


In [26]:
# how many non-null X-values does Pikachu have?
participant_tsv_pikachu.select(
    pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="Gaze point X").filter(
    pl.col("Gaze point X").is_null()
).select(
    non_null=participant_tsv_pikachu.shape[0] - pl.col("count").cast(pl.Int64)
)

non_null
i64
38205


In [27]:
def review_all_trial_null_counts_X():
    record_nullity_counts = []
    for tsv_file in Path(TOBII_ROOT).iterdir():
        print(tsv_file)
        df = read_tobii_individual_tsv(tsv_file)
        n_null_gaze_x = df.select(
            pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
        ).unnest("Gaze point X value counts").sort(by="Gaze point X").filter(
            pl.col("Gaze point X").is_null()
        ).select(
            n_records=df.shape[0],
            n_null=pl.col("count"),
            non_null=df.shape[0] - pl.col("count"),
        ).with_columns(
            name=pl.lit(" ".join(tsv_file.name.split(" ")[1:]))
        )
        record_nullity_counts.append(n_null_gaze_x)

    nullity_df = pl.DataFrame(record_nullity_counts)
    display(nullity_df)

review_all_trial_null_counts_X()

../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-07-Cyndaquil.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-06_Ozempic.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-26_Raichu.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-03-H.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-05_Luna.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-21 Charmander.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording11.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-20_Hannah_tobii1.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording10.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording12.tsv
../../RevChemData/2025-05-14-Data_Export

column_0
object
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════════╡ │ 55856 ┆ 5355 ┆ 50501 ┆ 2025-03-07-Cyndaquil.tsv │ └───────────┴────────┴──────────┴──────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════════╡ │ 54375 ┆ 4589 ┆ 49786 ┆ 2025-03-06_Ozempic.tsv │ └───────────┴────────┴──────────┴────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬───────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪═══════════════════════╡ │ 54633 ┆ 5527 ┆ 49106 ┆ 2025-02-26_Raichu.tsv │ └───────────┴────────┴──────────┴───────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════╡ │ 57352 ┆ 4540 ┆ 52812 ┆ 2025-03-03-H.tsv │ └───────────┴────────┴──────────┴──────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬─────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪═════════════════════╡ │ 55644 ┆ 5949 ┆ 49695 ┆ 2025-03-05_Luna.tsv │ └───────────┴────────┴──────────┴─────────────────────┘"
…
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════╡ │ 54174 ┆ 4525 ┆ 49649 ┆ 2025-05-07_Eevee.tsv │ └───────────┴────────┴──────────┴──────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════════╡ │ 57481 ┆ 5892 ┆ 51589 ┆ 2025-05-08_Relicanth.tsv │ └───────────┴────────┴──────────┴──────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════╡ │ 73002 ┆ 10331 ┆ 62671 ┆ 2025_03_03_Mew.tsv │ └───────────┴────────┴──────────┴────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════════╡ │ 86942 ┆ 48737 ┆ 38205 ┆ 2025-02-19 Pikachu.tsv │ └───────────┴────────┴──────────┴────────────────────────┘"


#### Data Preparation (for real)

In [28]:
#| export
COLUMN_RENAMING_REALEYE_TO_CSV = {
    "timestamp": "timestamp",
    "gaze_point_x": "X",
    "gaze_point_y": "Y"
}
COLUMN_RENAMING_TOBII_TO_CSV = {
    "Recording timestamp": "timestamp",
    "Gaze point X": "X",
    "Gaze point Y": "Y"
}

In [29]:
#| export
from RevChem.realeye import iter_parse_raw_data

In [30]:
# we basically want the following for all the coordiante pairs in the string RealEye gives us
pl.DataFrame([
    {"gaze_point_x": sextuple[0], "gaze_point_y": sextuple[1]}
    for sextuple in iter_parse_raw_data(raw_gazes_csv[15]["test_raw_data"].item())
])

gaze_point_x,gaze_point_y
i64,i64
879,388
908,385
899,450
908,419
871,319
…,…
984,433
958,322
984,421
967,336


In [31]:
#| export
from typing import Iterable
def itersize(any_iter: Iterable) -> int:
    count = 0
    for _ in any_iter: count += 1
    return count

In [32]:
#| export
def enhance_realeye_metadata(realeye_df: pl.DataFrame) -> pl.DataFrame:
    "Decorate the RealEye metadata (typed or not) with addition information"
    # use pl.Struct to dynamically instruct the strong typing in Rust (which drives Polars)
    re_coord_type = pl.Struct({"gaze_point_x": pl.Int64, "gaze_point_y": pl.Int64})
    return realeye_df.with_columns(
        # unpack into dictionaries that function as tuples. Call them coordinate pairs
        coordinate_pairs=pl.col("test_raw_data").map_elements(
            lambda s: [
                {"gaze_point_x": sextuple[0], "gaze_point_y": sextuple[1]}
                for sextuple in iter_parse_raw_data(s)
            ],
            return_dtype=pl.List(re_coord_type),
        ),
        # count the len of the list of tuples RealEye exported into that one row. Call it n_elements
        n_elements=pl.col("test_raw_data").map_elements(
            lambda s: itersize(iter_parse_raw_data(s)), return_dtype=pl.Int32
        ),  # call that the number of elements
    ).drop("test_raw_data")
    # .sort(by="test_created_at") # temporal sort is not necessary. We've sorted the data already.


def read_realeye_raw_gazes_csv(
    path_to_csv: str | Path, decorated: bool = True
) -> pl.DataFrame:
    "Real the RealEye raw-gazes.csv, decorating with field of RealEyeStruct if `decorated` == True"
    raw_csv = pl.read_csv(
        path_to_csv,
        columns=["participant_id", "item_id", "test_created_at", "test_raw_data"],
        schema_overrides={"test_created_at": pl.Datetime},
    ).sort(by="test_created_at")

    if decorated:
        return enhance_realeye_metadata(raw_csv)
    else:
        return raw_csv

In [33]:
# unpack that raw string for all of the RealEye data
raw_gazes_enhanced = enhance_realeye_metadata(realeye_raw_sorted)

In [34]:
# what's exclusively in the new year
raw_gazes_enhanced.filter(pl.col("test_created_at") > pl.date(2025, 1, 1))

participant_id,item_id,test_created_at,coordinate_pairs,n_elements
str,str,datetime[μs],list[struct[2]],i32
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-19 20:17:11,"[{960,590}, {1041,652}, … {700,726}]",32
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""83e23c46-4fe9-483b-abb5-4ba3cc…",2025-02-19 20:17:11,"[{910,672}, {908,660}, … {924,949}]",158
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""f456cacf-9abd-4825-a964-859f1b…",2025-02-19 20:17:11,"[{718,630}, {753,620}, … {906,646}]",31
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""d4bdad0c-13ae-4276-b1f4-007c2a…",2025-02-19 20:17:11,"[{947,669}, {1015,692}, … {1290,604}]",1867
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""e9978edb-9cdf-4db7-acef-3a08df…",2025-02-19 20:17:11,"[{1079,524}, {1142,527}, … {1187,555}]",32
…,…,…,…,…
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""75ee4147-26ff-48ff-8c88-1743c3…",2025-05-14 20:10:48,"[{949,747}, {875,678}, … {1246,444}]",251
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""63dc6493-df28-4aba-9d79-1dedc3…",2025-05-14 20:10:48,"[{950,749}, {982,690}, … {1287,674}]",1414
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""efff95c6-ae5c-482a-9388-dafb00…",2025-05-14 20:10:48,"[{431,502}, {499,556}, … {1391,454}]",249
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""e5762660-01a6-4e77-8e04-066b96…",2025-05-14 20:10:48,"[{1914,505}, {1664,566}, … {850,310}]",1411


In [35]:
#| export
from typing import NamedTuple
from datetime import datetime

class RealEyeStruct(NamedTuple):
    "Truncated struct containing the minimum necessary information to make RealEye exports useful"
    participant_id: str # the participant being tested
    item_id: str # the stimulus being shown
    test_created_at: datetime # when the web browser started running (NOT THE SAME AS THE STIMULUS START TIME)
    coordinate_pairs: list[dict[str, int]] # [{"gaze_point_x": <int>, "gaze_point_y": <int>}]
    n_elements: int # number of coordinate pairs in <self>.coordinate_pairs

    @classmethod
    def from_tuple(cls, tuple_) -> 'RealEyeStruct':
        return cls(*tuple_)

In [36]:
#| export
from datetime import timedelta, datetime, UTC

def unroll_realeye_dataframe_into_record_dataframes(df: pl.DataFrame):
    """Convert each row of a RealEye-exported CSV into a dataframe of timestamped records
    
    We assume 30 Hz data is given, and so concatenate all dataframes in the order they are ingested/converted
    with a rolling 1/30th of a second added to the time of the first record encountered.
    """
    time_inc = timedelta(seconds=1/30) # 30 Hz data
    dfs = []
    for row in map(RealEyeStruct.from_tuple, df.rows()):
        pseudo_start_time = row.test_created_at # .replace(second=0)
        row_df = pl.DataFrame({
            "timestamp": row.test_created_at.replace(tzinfo=UTC), # we assert the RealEye dataframe is UTC timestamped, per the docs
            "X": [pairs["gaze_point_x"] for pairs in row.coordinate_pairs],
            "Y": [pairs["gaze_point_y"] for pairs in row.coordinate_pairs]
        })
        dfs.append((pseudo_start_time,row_df))

    # group by -> concat all the columns -> count the time with all the data in order
    # NOTE: grouping unique to the minute, which should be shared among RealEye, though the second may differ
    # in particular, several entries are exactly 1 second apart.
    # We assume the later of these entries is "next" chronologically.
    grouped_dfs = group_by(lambda tup: tup[0].replace(second=0, microsecond=0), dfs)
    output_dfs = []
    for start_time, group in grouped_dfs.items():
        df_at_start_time = pl.concat([df for _, df in group])
        start_time = df_at_start_time["timestamp"][0]
        end_time = start_time + time_inc * (df_at_start_time.shape[0] - 1)
        df_with_time_corrected = df_at_start_time.with_columns(
            timestamp=pl.datetime_range(start_time, end_time, time_inc)
        )
        output_dfs.append(df_with_time_corrected)
    
    return output_dfs

In [37]:
#| export
from RevChem.common import list_concat

In [38]:
# | export
import collections
from typing import (
    Callable,
    Dict,
    Generic,
    Iterable,
    List,
    Tuple,
    TypeVar,
)

# Define generic type variables for keys and DataFrames
K = TypeVar("K")  # Key type (e.g., datetime, str, int)
DF = TypeVar("DF", bound=pl.DataFrame)  # Value type, bound to Polars DataFrame
R = TypeVar("R")  # Return type for the apply method


class GroupedFrames(Generic[K, DF]):
    """
    A container for DataFrames grouped by a key.

    This class provides a more expressive and functional alternative to a raw `dict[key, list[DataFrame]]`.
    It is designed to hold groups of Polars DataFrames and offers a dedicated API for common aggregation and
    transformation tasks.

    The internal groups are stored in a dictionary sorted by the group key to ensure predictable iteration order.

    Args:
        data (Dict[K, List[DF]]): A dictionary mapping group keys to lists
                                  of DataFrames.
    """

    def __init__(self, data: Dict[K, List[DF]]):
        # Store data internally, ensuring it's sorted by key for consistency.
        self._groups: Dict[K, List[DF]] = dict(sorted(data.items()))

    @classmethod
    def from_tuples(cls, iterable: Iterable[Tuple[K, DF]]) -> "GroupedFrames[K, DF]":
        """
        Create a GroupedFrames instance from an iterable of (key, DataFrame) tuples.

        This factory method is the primary way to construct a GroupedFrames object.
        It performs the grouping operation itself.

        Args:
            iterable (Iterable[Tuple[K, DF]]): An iterable yielding tuples of (group_key, data_frame).

        Returns:
            GroupedFrames[K, DF]: A new instance with the data grouped by key.
        """
        groups = collections.defaultdict(list)
        for key, frame in iterable:
            groups[key].append(frame)
        return cls(dict(groups))

    def __repr__(self) -> str:
        """Provides a concise representation of the object."""
        num_groups = len(self._groups)
        # Preview the first 5 keys for context
        keys_preview = list(self._groups.keys())[:5]
        keys_str = ", ".join(map(str, keys_preview))
        if num_groups > 5:
            keys_str += ", ..."
        return f"{self.__class__.__name__}(num_groups={num_groups}, keys=[{keys_str}])"

    # --- Dictionary-like interface ---

    def __len__(self) -> int:
        """Returns the number of groups."""
        return len(self._groups)

    def __getitem__(self, key: K) -> List[DF]:
        """Retrieves the list of DataFrames for a given key."""
        return self._groups[key]

    def __iter__(self):
        """Iterates over the group keys."""
        return iter(self._groups)

    def keys(self):
        """Returns the view of group keys."""
        return self._groups.keys()

    def values(self):
        """Returns the view of the lists of DataFrames."""
        return self._groups.values()

    def items(self):
        """Returns a view of the (key, list[DataFrame]) items."""
        return self._groups.items()

    # --- Specialized Methods ---

    def ungroup(self) -> list[tuple[K, DF]]:
        """
        Flattens the groups into a single list of all DataFrames.

        The order of frames is preserved based on the sorted group keys and the original order of frames within each group.

        Returns:
            list[DF]: A single list containing all DataFrames with their keys from all groups.
        """
        return list_concat([(key, frame) for key, frames in self._groups.items() for frame in frames])

    def concat_groups(self, **kwargs) -> Dict[K, DF]:
        """
        Concatenates the DataFrames within each group into a single DataFrame.

        This is useful for consolidating fragmented data for each group key.
        Any keyword arguments are passed directly to `polars.concat`.

        Args:
            **kwargs: Additional keyword arguments for `pl.concat()`, e.g.,
                      `how='diagonal'`.

        Returns:
            Dict[K, DF]: A dictionary mapping each key to its single,
                         concatenated DataFrame.
        """
        return {
            key: pl.concat(frames, **kwargs) for key, frames in self.items() if frames
        }

    def apply(self, func: Callable[[K, list[DF]], R], **kwargs) -> Dict[K, R]:
        """
        Applies a function to each group and returns the results.

        The function receives the group key and the list of DataFrames for that group.

        Args:
            func (Callable[[K, list[DF]], R]): The function to apply.
                                               It takes `(key, frames)` and returns a result.
            **kwargs: Additional keyword arguments for `func`, e.g., `x=2` for `func(a, b, *, x=10)`.


        Returns:
            Dict[K, R]: A dictionary mapping each key to the result of the function.
        """
        return {key: func(key, frames) for key, frames in self.items()}

In [None]:
# | export
def count_group_backwards(
    end_time: datetime,
    group: Iterable[pl.DataFrame],
    *,
    time_inc: timedelta = timedelta(seconds=1 / 30),
) -> pl.DataFrame:
    # (old) debug stuff:
    # print(f"{type(group) = } {type(group[0]) = }")
    concatted_df = pl.concat([df for df in group])
    # print(f"{type(concatted_df) = }")
    end_time = end_time.replace(tzinfo=UTC)  # coerce to UTC
    # assert end_time == end_time_, f"{end_time} != {end_time_}"
    start_time = end_time - time_inc * (concatted_df.shape[0] - 1)

    # overwrites the timestamp column by making a new "timestamp" column with the same name
    df_with_time_corrected = concatted_df.with_columns(
        timestamp=pl.datetime_range(start_time, end_time, time_inc)
    )

    return df_with_time_corrected


def unroll_realeye_df_counting_backwards(df: pl.DataFrame):
    """Convert RealEye Raw record to the unrolled record, with the start_time interpreted as the end time.

    Video evidence suggests that the start_time field is actually more like the "recording completed at" time
    i.e. the end time.
    """
    time_inc = timedelta(seconds=1 / 30)  # 30 Hz data
    dfs_to_group = []
    for row in map(RealEyeStruct.from_tuple, df.rows()):
        # we assert the RealEye dataframe is UTC timestamped, per the docs
        # "timestamp": row.test_created_at.replace(tzinfo=UTC),
        row_df = pl.DataFrame(
            {
                "X": [pairs["gaze_point_x"] for pairs in row.coordinate_pairs],
                "Y": [pairs["gaze_point_y"] for pairs in row.coordinate_pairs],
            }
        )
        dfs_to_group.append((row.test_created_at, row_df))

    # group by -> concat all the columns -> count the time with all the data in order
    # NOTE: grouping unique to the minute, which should be shared among RealEye, though the second may differ
    grouped_dfs = GroupedFrames.from_tuples(dfs_to_group)

    # in particular, several entries are exactly 1 second apart.
    # We assume the later of these entries is "next" chronologically.
    grouped_dfs = GroupedFrames.from_tuples(
        (ts.replace(second=0, microsecond=0), df) for ts, group in grouped_dfs.items()
        for df in group
    )

    output_dfs = grouped_dfs.apply(count_group_backwards, time_inc=time_inc)

    # flatten out the group values, again
    return output_dfs


In [40]:
#| export
def unroll_realeye_df_counting_backwards(df: pl.DataFrame):
    """Convert RealEye Raw record to the unrolled record, with the start_time interpreted as the end time.

    Video evidence suggests that the start_time field is actually more like the "recording completed at" time
    i.e. the end time.
    """
    time_inc = timedelta(seconds=1 / 30)  # 30 Hz data
    dfs = []
    for row in map(RealEyeStruct.from_tuple, df.rows()):
        # we assert the RealEye dataframe is UTC timestamped, per the docs
        # "timestamp": row.test_created_at.replace(tzinfo=UTC),
        row_df = pl.DataFrame({
            "X": [pairs["gaze_point_x"] for pairs in row.coordinate_pairs],
            "Y": [pairs["gaze_point_y"] for pairs in row.coordinate_pairs],
        })
        dfs.append((row.test_created_at, row_df))

    # group by -> concat all the columns -> count the time with all the data in order
    # NOTE: grouping unique to the minute, which should be shared among RealEye, though the second may differ
    # in particular, several entries are exactly 1 second apart.
    # We assume the later of these entries is "next" chronologically.
    grouped_dfs = group_by(lambda tup: tup[0], dfs)

    # regroup
    dfs = list_concat(grouped_dfs.values())
    print(f"{type(dfs) = }, {type(dfs[0]) = }, {type(dfs[0][0]) = }  and {len(dfs) = }")
    grouped_dfs = group_by(lambda tup: tup[0].replace(second=0, microsecond=0), dfs)

    # transform the groups
    output_dfs = []
    for end_time, group in grouped_dfs.items():
        concatted_df = pl.concat([df for _, df in group])
        end_time = end_time.replace(tzinfo=UTC)  # concatted_df["timestamp"][0]
        # assert end_time == end_time_, f"{end_time} != {end_time_}"
        start_time = end_time - time_inc * (concatted_df.shape[0] - 1)

        # overwrites the timestamp column by making a new "timestamp" column with the same name
        df_with_time_corrected = concatted_df.with_columns(
            timestamp=pl.datetime_range(start_time, end_time, time_inc)
        )
        output_dfs.append(df_with_time_corrected)

    # flatten out the group values, again
    return output_dfs

In [41]:
all_realeye_data_as_exportable = unroll_realeye_dataframe_into_record_dataframes(raw_gazes_enhanced)

In [42]:
all_realeye_data_as_exportable_counted_backward = unroll_realeye_df_counting_backwards(raw_gazes_enhanced)

type(dfs) = <class 'list'>, type(dfs[0]) = <class 'tuple'>, type(dfs[0][0]) = <class 'datetime.datetime'>  and len(dfs) = 350


In [43]:
all_realeye_data_as_exportable

[shape: (7_279, 3)
 ┌────────────────────────────────┬──────┬─────┐
 │ timestamp                      ┆ X    ┆ Y   │
 │ ---                            ┆ ---  ┆ --- │
 │ datetime[μs, UTC]              ┆ i64  ┆ i64 │
 ╞════════════════════════════════╪══════╪═════╡
 │ 2024-10-23 18:37:18 UTC        ┆ 1085 ┆ 721 │
 │ 2024-10-23 18:37:18.033333 UTC ┆ 1027 ┆ 765 │
 │ 2024-10-23 18:37:18.066666 UTC ┆ 986  ┆ 776 │
 │ 2024-10-23 18:37:18.099999 UTC ┆ 1284 ┆ 729 │
 │ 2024-10-23 18:37:18.133332 UTC ┆ 1058 ┆ 842 │
 │ …                              ┆ …    ┆ …   │
 │ 2024-10-23 18:41:20.464242 UTC ┆ 847  ┆ -36 │
 │ 2024-10-23 18:41:20.497575 UTC ┆ 661  ┆ 26  │
 │ 2024-10-23 18:41:20.530908 UTC ┆ 881  ┆ 64  │
 │ 2024-10-23 18:41:20.564241 UTC ┆ 898  ┆ 26  │
 │ 2024-10-23 18:41:20.597574 UTC ┆ 1193 ┆ -35 │
 └────────────────────────────────┴──────┴─────┘,
 shape: (7_110, 3)
 ┌────────────────────────────────┬──────┬─────┐
 │ timestamp                      ┆ X    ┆ Y   │
 │ ---                        

In [44]:
all_realeye_data_as_exportable_counted_backward

[shape: (7_279, 3)
 ┌──────┬─────┬────────────────────────────────┐
 │ X    ┆ Y   ┆ timestamp                      │
 │ ---  ┆ --- ┆ ---                            │
 │ i64  ┆ i64 ┆ datetime[μs, UTC]              │
 ╞══════╪═════╪════════════════════════════════╡
 │ 1085 ┆ 721 ┆ 2024-10-23 18:32:57.402426 UTC │
 │ 1027 ┆ 765 ┆ 2024-10-23 18:32:57.435759 UTC │
 │ 986  ┆ 776 ┆ 2024-10-23 18:32:57.469092 UTC │
 │ 1284 ┆ 729 ┆ 2024-10-23 18:32:57.502425 UTC │
 │ 1058 ┆ 842 ┆ 2024-10-23 18:32:57.535758 UTC │
 │ …    ┆ …   ┆ …                              │
 │ 847  ┆ -36 ┆ 2024-10-23 18:36:59.866668 UTC │
 │ 661  ┆ 26  ┆ 2024-10-23 18:36:59.900001 UTC │
 │ 881  ┆ 64  ┆ 2024-10-23 18:36:59.933334 UTC │
 │ 898  ┆ 26  ┆ 2024-10-23 18:36:59.966667 UTC │
 │ 1193 ┆ -35 ┆ 2024-10-23 18:37:00 UTC        │
 └──────┴─────┴────────────────────────────────┘,
 shape: (7_110, 3)
 ┌──────┬─────┬────────────────────────────────┐
 │ X    ┆ Y   ┆ timestamp                      │
 │ ---  ┆ --- ┆ ---           

In [45]:
#| export
def apply(s, transform): return transform(s)

def clean_tsv_file_name(fname: str) -> str:
    from functools import reduce

    transformations = [
        lambda s: s.split("1.Realeye1,2,3")[1].strip(),
        lambda s: s.rstrip(".tsv")
    ]

    return reduce(apply, transformations, fname)

In [46]:
def test_clean_tsv_file_name():
    from fastcore.test import test_eq
    dummy_data = [  # names we need to be able to deal with
        "1.Realeye1,2,3 2025-03-03-H.tsv",
        "1.Realeye1,2,3 2025-03-03_Mew.tsv",
        "1.Realeye1,2,3 2025-03-05 Blastoise.tsv",
    ]

    output_names = [clean_tsv_file_name(data) for data in dummy_data]

    test_eq(all("Realeye" not in name for name in output_names), True)
    test_eq(all(".tsv" not in name for name in output_names), True)


test_clean_tsv_file_name()

In [71]:
# Tobii data
all_tobii_data_as_exportable = [
    read_tobii_individual_tsv(tsv_file)[COLUMNS_TOBII]
    .rename(COLUMN_RENAMING_TOBII_TO_CSV)
    .with_columns(source_tsv=pl.lit(clean_tsv_file_name(tsv_file.name)))
    for tsv_file in Path(TOBII_ROOT).iterdir()
]

In [72]:
len(all_realeye_data_as_exportable), len(all_tobii_data_as_exportable), len(all_realeye_data_as_exportable_counted_backward)

(35, 33, 35)

2025-04-22 11:50 note
Temporary conclusion: one of the RealEye records is a dud. Hard to know which one.

March 7th look suspect for being 1/4 the usual length of a record, resulting in a trial that's only a minute long.
- Every other recording is about 5 minute long.

2025-04-22 13:27 update
- After correcting the joining of the dataframes for the RealEye data, there's now an extra Tobii session
   - it looks like it's the Pichu2 run, which is exceedingly short for some reason.
   - That sorts that out.

This is the end.

2025-06-10 11:42 update
- The pairing algorithm has been sound but the code wasn't being called correctly
- In the analysis, it seems that there is a scarcity of Tobii rows
   - Why? Because RealEye sequences run for about 4 minutes within a ~10 min Tobii trial, but the Tobii sequence ends before then - usually seconds after the RealEye run begins
   - Ours is the task of seing if we have some data loss in the pipeline that this notebook has become.

2025-07-16
Addressed a subtle bug that snuck into the code, by the Tobii start datetime getting doubly added, ending up with the Tobii times that are out of this world e.g. 2080-12-25.
- This doesn't solve the problems that we saw in [02_data_inspect.ipynb](./02_data_inspect.ipynb)

In [49]:
# | export
from datetime import tzinfo, UTC
from typing import TypeVar
from RevChem.common import partition, Predicate


def filter_to_newyear_and_sort_by_timestamp(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    return sorted(
        filter(
            lambda df: (df["timestamp"][0] >= datetime(2025, 1, 1, tzinfo=UTC)),
            dfs,
        ),
        key=lambda df: df["timestamp"][0],
    )


_T = TypeVar("_T")
_Criterion = tuple[Predicate[_T], str]


def _filter_by_criteria_loop(
    dfs: list[pl.DataFrame],
    criteria: list[_Criterion],
    verbose: bool = True,
) -> tuple[list[pl.DataFrame], list[list[pl.DataFrame]]]:

    misses_in_order = []
    for criterion_index, (crit, explanation) in enumerate(criteria):
        dfs, missed = partition(crit, dfs)
        if missed:
            print(f'{len(missed)} dfs failed the {criterion_index}-th criterion "{explanation}"')
            if verbose:
                print(
                    f"The following previews show the dfs that missed the criteria: {explanation}"
                )
                with pl.Config(tbl_rows=10):
                    for miss_df in missed:
                        print(miss_df.head(2).with_columns(count=len(miss_df)))
        misses_in_order.append(missed)

    return dfs, misses_in_order


def filter_tobii_dfs_by_new_years_heuristics(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    """
    Remove dataframes that do not meet our heuristic and discovered criteria for a valid Tobii trial.

    Returns:
        All of the DataFrames that meet all filtering heuristics and invariants
    """
    criteria = [
        (
            (lambda df: len(df) >= 51_000),
            "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ",
        ),
        (
            (lambda df: all((name not in df["source_tsv"][0] for name in ["Pichu2"]))),
            "found df in black list should not be test recordings",
        ),
    ]
    # If we ever need to do hard debugging, we can name that _ and inspect. Here, we're just releasing the memory.
    dfs, _ = _filter_by_criteria_loop(dfs, criteria)

    return dfs


def filter_realeye_dfs_by_new_years_heuristics(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    """
    Remove dataframes that do not meet our heuristic and discovered criteria for a valid Tobii trial.

    Returns:
        All of the DataFrames that meet all filtering heuristics and invariants
    """
    criteria = [
        (
            (lambda df: len(df) >= 5_000),
            "len(df) >= 5k, corresponding to >= 2.8 minutes of recording @ 30-HZ",
        ),
    ]
    # If we ever need to do hard debugging, we can name that _ and inspect. Here, we're just releasing the memory.
    dfs, _ = _filter_by_criteria_loop(dfs, criteria)

    return dfs

In [50]:
all_realeye_new_year_in_order = filter_realeye_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_realeye_data_as_exportable)
)
all_tobii_new_year_in_order = filter_tobii_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_tobii_data_as_exportable)
)
all_realeye_new_year_in_order_counted_backward = filter_realeye_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_realeye_data_as_exportable_counted_backward)
)

1 dfs failed the 0-th criterion "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ"
The following previews show the dfs that missed the criteria: len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ
shape: (2, 5)
┌────────────────────────────────┬──────┬──────┬──────────────────┬───────┐
│ timestamp                      ┆ X    ┆ Y    ┆ source_tsv       ┆ count │
│ ---                            ┆ ---  ┆ ---  ┆ ---              ┆ ---   │
│ datetime[μs, UTC]              ┆ i32  ┆ i32  ┆ str              ┆ i32   │
╞════════════════════════════════╪══════╪══════╪══════════════════╪═══════╡
│ 2080-04-26 23:12:42.406 UTC    ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
│ 2080-04-26 23:12:42.528819 UTC ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
└────────────────────────────────┴──────┴──────┴──────────────────┴───────┘


## On the Length mismatch problem i.e. time series not being alignable without hacky code and no sense-making
We know this from the analysis alone
* a **Tobii** recording **should be at least 51k data points** -> @120HZ is **7 minutes**. 
* A **RealEye** recording **should be at least 7k data points** -> @30HZ is **3.9 minutes**. With a minute for each stimulus, and about 40 seconds of calibration this makes sense.  That said, I think it's supposed to longer but whatever. We'll have to check the audio recordings to have that for sure.

Why don't we do this manually?
* Because we have to define the RealEye association in code anyway, because it has no meaningful identifier.
* Might as well document the heuristic by which they are associated, *in said code*.

In [66]:
len(all_realeye_new_year_in_order), len(all_tobii_new_year_in_order), len(all_realeye_new_year_in_order_counted_backward)

(28, 32, 28)

In [52]:
list(
    pl.concat(
        [re, tob.rename({"timestamp": "ts_tobii", "X": "X_tobii", "Y": "Y_tobii"})],
        how="horizontal",
    )["ts_tobii", "timestamp", "X_tobii", "Y_tobii", "X", "Y", "source_tsv"]
    for (re, tob) in zip(
        [df.head(1) for df in all_realeye_new_year_in_order],
        [df.head(1) for df in all_tobii_new_year_in_order],
    )
)

[shape: (1, 7)
 ┌───────────────────────────┬─────────────────────────┬─────────┬─────────┬─────┬─────┬────────────┐
 │ ts_tobii                  ┆ timestamp               ┆ X_tobii ┆ Y_tobii ┆ X   ┆ Y   ┆ source_tsv │
 │ ---                       ┆ ---                     ┆ ---     ┆ ---     ┆ --- ┆ --- ┆ ---        │
 │ datetime[μs, UTC]         ┆ datetime[μs, UTC]       ┆ i32     ┆ i32     ┆ i64 ┆ i64 ┆ str        │
 ╞═══════════════════════════╪═════════════════════════╪═════════╪═════════╪═════╪═════╪════════════╡
 │ 2079-08-20 17:32:04.154   ┆ 2025-02-19 20:17:11 UTC ┆ null    ┆ null    ┆ 960 ┆ 590 ┆ Recording8 │
 │ UTC                       ┆                         ┆         ┆         ┆     ┆     ┆            │
 └───────────────────────────┴─────────────────────────┴─────────┴─────────┴─────┴─────┴────────────┘,
 shape: (1, 7)
 ┌──────────────────────────┬─────────────────────────┬─────────┬─────────┬──────┬─────┬────────────┐
 │ ts_tobii                 ┆ timestamp            

In-order zip-based pairing is inaccurate and unintelligent.

Algorithm that should solve it simply

0. Prep: dfs -> tuples of starting time and df, to ease the comparison function
    - alt: write a comparator that accesses that property
1. Brutal force
    1. For each tobii session, find the closest RealEye session
    2. For logging purposes, articulate the `pl.Duration` or `datetime.TimeDelta`: `end - start` for RealEye
2. More intelligent
    1. Pop found indices from the available indices to search, so you don't double-select a `DataFrame`
    2. Use the `source_tsv` property as a key into a dictionary, to store the found DataFrame once its found
        - or its index. Doesn't really matter


In [53]:
#| export
from datetime import datetime, timedelta
import random

def generate_random_datetimes(start, count, max_minutes_range):
    """
    # Function to generate random datetime list
    # Parameters: start datetime, number of datetimes, max minutes range
    """
    return [start + timedelta(minutes=random.randint(0, max_minutes_range)) for _ in range(count)]


In [54]:
#| hide
def test_datetime_algorithms():
    a_times = sorted(generate_random_datetimes(datetime(2025, 2, 1), 10, 1440 * 5))
    b_times = sorted(generate_random_datetimes(datetime(2025, 2, 1), 5, 1440 * 5))
    display(a_times, b_times)

test_datetime_algorithms()

[datetime.datetime(2025, 2, 1, 9, 46),
 datetime.datetime(2025, 2, 1, 13, 41),
 datetime.datetime(2025, 2, 2, 19, 51),
 datetime.datetime(2025, 2, 2, 21, 40),
 datetime.datetime(2025, 2, 3, 10, 6),
 datetime.datetime(2025, 2, 5, 0, 12),
 datetime.datetime(2025, 2, 5, 0, 12),
 datetime.datetime(2025, 2, 5, 3, 7),
 datetime.datetime(2025, 2, 5, 4, 53),
 datetime.datetime(2025, 2, 5, 16, 8)]

[datetime.datetime(2025, 2, 3, 5, 33),
 datetime.datetime(2025, 2, 3, 13, 11),
 datetime.datetime(2025, 2, 4, 3, 59),
 datetime.datetime(2025, 2, 4, 11, 38),
 datetime.datetime(2025, 2, 5, 17, 1)]

In [55]:
#| export
def find_tobii_realeye_df_pairs(
    tobii_dfs: list[pl.DataFrame], realeye_dfs: list[pl.DataFrame],
    *,
    _logging: bool = False
) -> list[tuple[pl.DataFrame, pl.DataFrame]]:
    """
    Algorithm

    0. Prep: dfs -> tuples of starting time and df, to ease the comparison function
        - alt: write a comparator that accesses that property
    1. Brutal force
        1. For each tobii session, find the closest RealEye session
        2. For logging purposes, articulate the `pl.Duration` or `datetime.TimeDelta`: `end - start` for RealEye
    2. More intelligent
        1. Pop found indices from the available indices to search, so you don't double-select a `DataFrame`
        2. Use the `source_tsv` property as a key into a dictionary, to store the found DataFrame once its found
            - or its index. Doesn't really matter
    """
    times_tobii = [df["timestamp"][0] for df in tobii_dfs]
    times_realeye = [df["timestamp"][0] for df in realeye_dfs]
    # tobii is the reference, realeye is under scrutiny
    # 50 sec is the shortest time between Tobii "Start record", skipping validation, and starting RealEye
    _MIN_TIME_DELTA = timedelta(seconds=50)
    found_indices: set[int] = set() # just because I don't want to order it
    pair_indices = []
    for tobii_time in times_tobii:
        current_min_time_diff = timedelta(days=1_000) # more time than is sensible
        found_index = -1
        for i, re_time in enumerate(times_realeye):
            latest_diff = re_time - tobii_time
            if (i not in found_indices) and (_MIN_TIME_DELTA <= latest_diff < current_min_time_diff):
                found_index = i # we've found it. Don't need to log it
                current_min_time_diff = latest_diff
                _logging and print(f"Changed: {current_min_time_diff = }")
        found_indices.add(found_index)
        pair_indices.append(found_index)

    result = [
        (tobii_df.sort("timestamp"), realeye_dfs[re_index].sort("timestamp")) 
        for re_index, tobii_df in zip(pair_indices, tobii_dfs)
    ]
    return result

In [56]:
def test_pairing_algo():
    "Testing whether the algorithm works well. (Hint: it does)"
    # this works
    pairs = find_tobii_realeye_df_pairs(all_tobii_new_year_in_order, all_realeye_new_year_in_order_counted_backward)

    show = list(
        pl.concat(
            [re, tob.rename({"timestamp": "ts_tobii", "X": "X_tobii", "Y": "Y_tobii"})],
            how="horizontal",
        )["ts_tobii", "timestamp", "X_tobii", "Y_tobii", "X", "Y", "source_tsv"].head(1)
        for (tob, re) in pairs
    )
    display(show)

In [57]:
test_pairing_algo()

[shape: (1, 7)
 ┌───────────────────────────────┬─────────────────────┬─────────┬─────────┬─────┬─────┬────────────┐
 │ ts_tobii                      ┆ timestamp           ┆ X_tobii ┆ Y_tobii ┆ X   ┆ Y   ┆ source_tsv │
 │ ---                           ┆ ---                 ┆ ---     ┆ ---     ┆ --- ┆ --- ┆ ---        │
 │ datetime[μs, UTC]             ┆ datetime[μs, UTC]   ┆ i32     ┆ i32     ┆ i64 ┆ i64 ┆ str        │
 ╞═══════════════════════════════╪═════════════════════╪═════════╪═════════╪═════╪═════╪════════════╡
 │ 2079-08-20 17:32:04.154 UTC   ┆ 2025-05-14          ┆ null    ┆ null    ┆ 735 ┆ 520 ┆ Recording8 │
 │                               ┆ 20:05:55.435779 UTC ┆         ┆         ┆     ┆     ┆            │
 └───────────────────────────────┴─────────────────────┴─────────┴─────────┴─────┴─────┴────────────┘,
 shape: (1, 7)
 ┌───────────────────────────────┬─────────────────────┬─────────┬─────────┬─────┬─────┬────────────┐
 │ ts_tobii                      ┆ timestamp       

## ACTUALLY EXPORTING THE DATA

In [58]:
#| export 
from RevChem.common import dt_str_now, date_str_now

In [59]:
#| export
def export_ordered_pairs(
    df_pairs: list[tuple[pl.DataFrame, pl.DataFrame]],
    *,
    output_dir_root: Path,
    output_suffix: Path,
    include_joined_output: bool = False,
    tobii_export_naming: dict[str, str] = {
        "timestamp": "timestamp_tobii",
        "X": "X_tobii",
        "Y": "Y_tobii",
    },
    exported_column_names=[
        "timestamp",
        "X",
        "Y",
    ],
):
    # exported_column_names += list(sorted(tobii_export_naming.values()))
    export_dir = output_dir_root / output_suffix
    for tobii_df, re_df in df_pairs:
        # name the directory after the trial date + subject
        trial_subject_date_and_name = tobii_df["source_tsv"][0]
        try:
            outputdir = export_dir / trial_subject_date_and_name
            outputdir.mkdir(exist_ok=False, parents=True)

            re_df[exported_column_names].write_csv(outputdir / "realeye.csv")
            tobii_df[exported_column_names].write_csv(outputdir / "tobii.csv")
            if include_joined_output:
                pl.concat(
                    [re_df, tobii_df.rename(tobii_export_naming)],
                    how="horizontal",
                ).write_csv(outputdir / "joined.csv")
        except Exception as e:
            print(f"Unexpected exception {e = }, {type(e) = }")
            print(f"{outputdir = }")

In [60]:
EXPORT_ROOT = Path(DATA_ROOT, "..", f"{date_str_now()}-python-outputs").resolve()

In [61]:
def run_data_export():
    pairs = find_tobii_realeye_df_pairs(
        all_tobii_new_year_in_order, all_realeye_new_year_in_order_counted_backward
    )
    suffix = Path(f"{dt_str_now()}-counting-backwards")
    export_ordered_pairs(
        pairs,
        output_dir_root=EXPORT_ROOT,
        output_suffix=suffix,
        include_joined_output=True,
    )

run_data_export()

## A cleaner pairing algorithm
It's written in generic form and mostly pulled from the Internet, but I've sown it together

In [62]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from datetime import datetime, timedelta
from typing import Callable, Optional, TypeVar


T_Time_Haver = TypeVar("T_Time_Haver")
type TimeFieldGetter = Callable[[T_Time_Haver], datetime | "time"]


def time_based_matching(
    list1: list[T_Time_Haver],
    list2: list[T_Time_Haver],
    time_field_getter: TimeFieldGetter,
    max_time_delta: Optional[timedelta] = None,
    penalty_multiplier: float = 1.0,
) -> list[tuple]:
    """
    Find optimal time-based matching minimizing start_time differences.

    Args:
        list1, list2: Lists of objects with time fields
        time_field: Name of the time attribute
        max_time_delta: Maximum allowed time difference (None = no limit)
        penalty_multiplier: Scale factor for time difference penalties

    Returns:
        List of (item1, item2, cost) tuples for matched pairs
    """

    n1, n2 = len(list1), len(list2)

    # Create cost matrix
    cost_matrix = np.full((n1, n2), np.inf)

    for i, item1 in enumerate(list1):
        time1 = time_field_getter(item1)
        for j, item2 in enumerate(list2):
            time2 = time_field_getter(item2)

            time_diff = abs((time1 - time2).total_seconds())

            # Apply time delta constraint if specified
            if max_time_delta and time_diff > max_time_delta.total_seconds():
                continue  # Leave as inf (no match allowed)

            # Cost is the time difference (scaled)
            cost_matrix[i, j] = time_diff * penalty_multiplier

    # Solve using Hungarian algorithm
    row_indices, col_indices = linear_sum_assignment(cost_matrix)

    # Build results
    results = []
    total_cost = 0

    for i, j in zip(row_indices, col_indices):
        if cost_matrix[i, j] < np.inf:
            cost = cost_matrix[i, j] / penalty_multiplier  # Convert back to seconds
            results.append((list1[i], list2[j], cost))
            total_cost += cost

    return results, total_cost

In [63]:
# Example usage
from dataclasses import dataclass
from datetime import datetime, timedelta
from operator import attrgetter

@dataclass
class Event:
    name: str
    start_time: datetime

# Sample data
events_a = [
    Event("Meeting A", datetime(2025, 6, 10, 9, 0)),
    Event("Meeting B", datetime(2025, 6, 10, 14, 0)),
    Event("Meeting C", datetime(2025, 6, 10, 16, 30))
]

events_b = [
    Event("Room 1", datetime(2025, 6, 10, 9, 15)),    # 15 min diff
    Event("Room 2", datetime(2025, 6, 10, 14, 30)),   # 30 min diff  
    Event("Room X", datetime(2025, 6, 10, 14, 28)),   # 28 min diff  
    Event("Room 3", datetime(2025, 6, 10, 16, 0))     # 30 min diff
]

# Find optimal matching
matches, total_cost = time_based_matching(
    events_a, events_b,
    attrgetter("start_time"),
    max_time_delta=timedelta(hours=1),
    penalty_multiplier=1.0
)

for event_a, event_b, time_diff in matches:
    print(f"{event_a.name} -> {event_b.name}: {time_diff/60:.1f} min difference")


Meeting A -> Room 1: 15.0 min difference
Meeting B -> Room X: 28.0 min difference
Meeting C -> Room 3: 30.0 min difference


## Triangle-only data

In [64]:
triangular_root = Path("/Users/stephen/dev/RevChemData/", "20250617-Triangle-coords")

In [65]:
def fiddling_triangle():
    """Show that there are no values triangle-related AOI columns"""
    # df = read_tobii_individual_tsv(
    #     Path(DATA_ROOT,  "..", "20250617-Triangle-coords", "1.Realeye1,2,3 2025-02-19 Pikachu.tsv")
    # )
    triangle_files = [
        Path(DATA_ROOT,  "..", "20250617-Triangle-coords", "1.Realeye1,2,3 2025-02-19 Pikachu.tsv")
    ]
    # triangle_files = Path(DATA_ROOT,  "..", "20250617-Triangle-coords",).iterdir()
    for file in (f for f in triangle_files if "test.tsv" not in f.name): # test.tsv has no contents.
        df = read_tobii_individual_tsv(file)
        columns_tri = [col for col in df.columns if ("triangle" in col) or ("fix" in col.casefold())]
        df_assess = df[
            ["Recording timestamp", "Gaze point X", "Gaze point Y"] + columns_tri
        ]
        for col in columns_tri:
            df_filter = df_assess.filter(pl.col(col).is_not_null())
            if df_filter.shape[0]: # if we have at least one record, show us what you have.
                display(f"File '{file}' - column '{col}'")
                display(df_filter)


fiddling_triangle()