# Tobii and RealEye record pairing
This notebook is to capture the work of pairing Tobii TSV files with RealEye long records.

In [1]:
#| default_exp tobii

In [2]:
#| export
import polars as pl

In [3]:

DATA_ROOT = "../../RevChemData/2025-05-14-Data_Export"
!ls {DATA_ROOT}

README.txt         [30m[43mRealEye[m[m            [30m[43mTobii-All-Snapshot[m[m


In [4]:
TOBII_ROOT = f"{DATA_ROOT}/Tobii-All-Snapshot"
REALEYE_ROOT = f"{DATA_ROOT}/RealEye"

In [5]:
!ls {TOBII_ROOT}

[31m1.Realeye1,2,3 2025-02-19 Pikachu.tsv[m[m
[31m1.Realeye1,2,3 2025-02-19_Ailleen_RE.tsv[m[m
[31m1.Realeye1,2,3 2025-02-20_Hannah_tobii1.tsv[m[m
[31m1.Realeye1,2,3 2025-02-20_PinterestFiend.tsv[m[m
[31m1.Realeye1,2,3 2025-02-21 Charmander.tsv[m[m
[31m1.Realeye1,2,3 2025-02-21_Ponyta.tsv[m[m
[31m1.Realeye1,2,3 2025-02-26_Elinor.tsv[m[m
[31m1.Realeye1,2,3 2025-02-26_Raichu.tsv[m[m
[31m1.Realeye1,2,3 2025-02-28_Flute.tsv[m[m
[31m1.Realeye1,2,3 2025-03-03-H.tsv[m[m
[31m1.Realeye1,2,3 2025-03-05_Blastoise.tsv[m[m
[31m1.Realeye1,2,3 2025-03-05_Luna.tsv[m[m
[31m1.Realeye1,2,3 2025-03-06_Ozempic.tsv[m[m
[31m1.Realeye1,2,3 2025-03-07-Celebi.tsv[m[m
[31m1.Realeye1,2,3 2025-03-07-Cyndaquil.tsv[m[m
[31m1.Realeye1,2,3 2025-03-10-Dakrai.tsv[m[m
[31m1.Realeye1,2,3 2025-03-11_Ninetails.tsv[m[m
[31m1.Realeye1,2,3 2025-04-24_Totadile.tsv[m[m
[31m1.Realeye1,2,3 2025-05-05_Butterfree.tsv[m[m
[31m1.Realeye1,2,3 2025-05-05_Stephe

In [6]:
!wc -l {REALEYE_ROOT}/raw-gazes.csv

     351 ../../RevChemData/2025-05-14-Data_Export/RealEye/raw-gazes.csv


## Game plan
I want the timestamps of every record in the raw gazes
- Each row should have this time stamp, trivially.
    - The first row (of a given file) definitely will have it
- If necessary, I'll have to add some milliseconds to the first record and increment to get the start times of each segment of stimulus

I'm going to compare them to the start times of each of the Tobii records, which I can access from the first line of each of those records
- Tobii explicitly records the start time of the recording and milliseconds since the start of the recording


In [7]:
raw_gazes_csv = pl.read_csv(
    f"{REALEYE_ROOT}/raw-gazes.csv",
    columns=["participant_id", "item_id", "test_created_at", "test_raw_data"],
    schema_overrides={"test_created_at": pl.Datetime},
)

In [8]:
item_ids_in_order = raw_gazes_csv.unique("item_id", maintain_order=True)["item_id"]
item_ids_in_order

item_id
str
"""5b8637a5-c1c7-47cc-a1b4-5abe24…"
"""83e23c46-4fe9-483b-abb5-4ba3cc…"
"""f456cacf-9abd-4825-a964-859f1b…"
"""d4bdad0c-13ae-4276-b1f4-007c2a…"
"""e9978edb-9cdf-4db7-acef-3a08df…"
"""75ee4147-26ff-48ff-8c88-1743c3…"
"""63dc6493-df28-4aba-9d79-1dedc3…"
"""efff95c6-ae5c-482a-9388-dafb00…"
"""e5762660-01a6-4e77-8e04-066b96…"
"""b1641c81-54ff-4370-ab32-8476ef…"


In [9]:
# Don't bother with `test_id`. It's unique to the participant_id-item_id pair.
# See the following queries for proof
#
# raw_gazes_csv["participant_id"].unique_counts()
# raw_gazes_csv["item_id"].unique_counts()
# raw_gazes_csv["test_id"].unique_counts()

In [10]:
realeye_raw_sorted = raw_gazes_csv.sort(by="test_created_at")
display(realeye_raw_sorted["participant_id"][0])  # first ID, should match Pikachu
display(realeye_raw_sorted.columns)

'5f08f1c2-e9f3-4adb-9fb4-a68b3a1e12a9'

['participant_id', 'item_id', 'test_created_at', 'test_raw_data']

### Utility: Group-by

In [11]:
#| export
from RevChem.common import group_by

## Note - Timing
RealEye uses the UTC timestamp. I don't see this on their documentation, but it appears to agree with the Tobii recording.

In [12]:
realeye_raw_sorted

participant_id,item_id,test_created_at,test_raw_data
str,str,datetime[μs],str
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 18:37:18,"""[[1085,721,36,0,1919,593],[102…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""83e23c46-4fe9-483b-abb5-4ba3cc…",2024-10-23 18:37:18,"""[[911,667,51,0,1919,593],[956,…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""f456cacf-9abd-4825-a964-859f1b…",2024-10-23 18:37:18,"""[[1338,648,44,0,1919,593],[131…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""d4bdad0c-13ae-4276-b1f4-007c2a…",2024-10-23 18:37:18,"""[[1031,732,39,0,1919,593],[140…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""e9978edb-9cdf-4db7-acef-3a08df…",2024-10-23 18:37:18,"""[[1025,642,36,0,1919,593],[106…"
…,…,…,…
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""75ee4147-26ff-48ff-8c88-1743c3…",2025-05-14 20:10:48,"""[[949,747,40,0,1919,608],[875,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""63dc6493-df28-4aba-9d79-1dedc3…",2025-05-14 20:10:48,"""[[950,749,25,0,1919,608],[982,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""efff95c6-ae5c-482a-9388-dafb00…",2025-05-14 20:10:48,"""[[431,502,35,0,1919,608],[499,…"
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""e5762660-01a6-4e77-8e04-066b96…",2025-05-14 20:10:48,"""[[1914,505,45,0,1919,608],[166…"


### Make sure that the RealEye groups are concatenated consistently

Ordering should be retained in the group-by. This is just double-checking that.

In [13]:
def first(tup: tuple): return tup[0]
def second(tup: tuple): return tup[1]

In [14]:
# checking that the order is retained of the items
item_ids_in_order.to_list() == list(
    map(
        second, # second item is item_id
        group_by(lambda df_row: df_row[0], # [0] -> participant_id
         realeye_raw_sorted.iter_rows())[ # key into the a particular participant_id / trial
            "b9682fc8-11b0-4cac-87f9-ab9a98164ecf"
        ],
    )
)

True

In [None]:
# | export


def read_tobii_individual_tsv(
    path_to_tsv: str,
    schema_overrides: dict[str, pl.DataType] = {},
    *,
    change_timestamp_to_datetime: bool = True,  # whether to use this record's derived datetime to calculate per-observation datetimestamps in "Recording timestamp"
) -> pl.DataFrame:
    "Reads a trial-exported Tobii TSV, with the minimal columns that we need"
    individual_tsv = pl.read_csv(
        path_to_tsv,
        separator="\t",
        infer_schema=True,
        schema_overrides={
            "Recording start time UTC": pl.Time,
            "Recording start time": pl.Time,
            "Gaze point X": pl.Int32,
            "Gaze point Y": pl.Int32,
        }
        | schema_overrides,
    )
    start_date_utc = pl.col("Recording date UTC").str.to_date("%m/%d/%Y")
    start_time_utc = pl.col("Recording start time UTC")

    start_dt_utc = (
        start_date_utc.dt.combine(
            pl.time(
                start_time_utc.dt.hour(),
                start_time_utc.dt.minute(),
                start_time_utc.dt.second(),
                start_time_utc.dt.microsecond(),
            )
        )
        .dt.replace_time_zone(time_zone="UTC")
        .alias("start_dt_utc")
    )
    start_time_ = pl.col("Recording start time")

    start_date_ = (
        pl.col("Recording date").str.to_date("%m/%d/%Y")
        if "Recording date" in individual_tsv.columns
        else (
            start_dt_utc.dt.convert_time_zone(time_zone="America/New_York")
            .dt.date()
            .alias("Recording date")
        )
    )
    start_dt_tz = (
        start_date_.dt.combine(
            pl.time(
                start_time_.dt.hour(),
                start_time_.dt.minute(),
                start_time_.dt.second(),
                start_time_.dt.microsecond(),
            )
        )
        .dt.replace_time_zone(time_zone="America/New_York")
        .alias("start_dt_tz")
    )

    result = individual_tsv.with_columns(
        start_date_utc,
        start_date_,
        start_dt_utc,
        start_dt_tz,
    )
    if change_timestamp_to_datetime:
        result = result.with_columns(  # separate call so the columns that we fiddle are guaranteed present
            tobii_timestamp_to_datetime(datetime_col="start_dt_utc")
        )
    # display(plan) when using a LazyFrame to get debugging support
    return result

Now hacking my way to figuring out what the recording timestamp
- [documentation](https://go.tobii.com/Tobii-Pro-Lab-data-export-info) says its a microsecond, but that's not working on _all_ the data, just a preview (see below)

In [16]:
#| export
def tobii_timestamp_to_datetime(
    datetime_col: str = "Recording start datetime UTC", # column with the recording start datetime, UTC or TZ-specific.
    timestamp_col: str = "Recording timestamp", # the integer column representing the microseconds since the recording started
    *,
    overwrite: bool = True, # Whether, the `timestamp_col` will have a datetime type in the result, or (if False) a new column is created
) -> pl.DataFrame:
    """Update `timestamp_col` to be an increasing datetime rather than the default (i64 or int).

    Corrects the `timestamp_col` of `df` to be a `pl.Datetime`, to ease legibility and computation.
    Sums the `timestamp_col` with that of the reference `datetime_col`, incrementing the time forward.

    Returns:
        A dataframe with the described change to the `timestamp_col`
    """
    new_name = timestamp_col if overwrite else f"{timestamp_col}__new_dt_time"
    new_column = pl.col(datetime_col) + pl.duration(microseconds=timestamp_col)
    new_column = new_column.alias(new_name)

    return new_column

In [333]:
participant_tsv_pikachu = read_tobii_individual_tsv(
    TOBII_ROOT + "/1.Realeye1,2,3 2025-02-19 Pikachu.tsv",
)

In [18]:
participant_tsv_ninetails = read_tobii_individual_tsv(TOBII_ROOT+ "/1.Realeye1,2,3 2025-03-11_Ninetails.tsv").with_columns(
    tobii_timestamp_to_datetime(datetime_col="start_dt_utc")
)
participant_tsv_ninetails.shape

(61617, 165)

## Preparing a clean CSV for each of the Tobii runs

In [19]:
participant_tsv_ninetails[["Recording timestamp", "Recording start time UTC"]]

Recording timestamp,Recording start time UTC
"datetime[μs, UTC]",time
2025-03-11 21:20:24.432 UTC,21:20:24.432
2025-03-11 21:20:24.557179 UTC,21:20:24.432
2025-03-11 21:20:24.565513 UTC,21:20:24.432
2025-03-11 21:20:24.573848 UTC,21:20:24.432
2025-03-11 21:20:24.582182 UTC,21:20:24.432
…,…
2025-03-11 21:28:49.554316 UTC,21:20:24.432
2025-03-11 21:28:49.562650 UTC,21:20:24.432
2025-03-11 21:28:49.570983 UTC,21:20:24.432
2025-03-11 21:28:49.579316 UTC,21:20:24.432


In [68]:
[col for col in participant_tsv_ninetails.columns ]

['Recording timestamp',
 'Computer timestamp',
 'Sensor',
 'Project name',
 'Export date',
 'Participant name',
 'trials',
 'Recording name',
 'Recording date',
 'Recording date UTC',
 'Recording start time',
 'Recording start time UTC',
 'Recording duration',
 'Timeline name',
 'Recording Fixation filter name',
 'Recording software version',
 'Recording resolution height',
 'Recording resolution width',
 'Recording monitor latency',
 'Average calibration accuracy (mm)',
 'Average calibration precision SD (mm)',
 'Average calibration precision RMS (mm)',
 'Average calibration accuracy (degrees)',
 'Average calibration precision SD (degrees)',
 'Average calibration precision RMS (degrees)',
 'Average calibration accuracy (pixels)',
 'Average calibration precision SD (pixels)',
 'Average calibration precision RMS (pixels)',
 'Average validation accuracy (mm)',
 'Average validation precision SD (mm)',
 'Average validation precision RMS (mm)',
 'Average validation accuracy (degrees)',
 'Av

In [69]:
COLUMNS_TOBII = [
    "Recording timestamp",
    "Gaze point X",
    "Gaze point Y",
]

#### Coordinate nullity analysis
Looking at the coordinates captured across X- and Y- to assess the damage and reach understanding

In [21]:
participant_tsv_pikachu.select(
    pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="Gaze point X").select(
    pl.col("count").alias("Metacount").value_counts()
).unnest(
    "Metacount"
)

Metacount,count
u32,u32
42,7
46,6
5,76
44,3
38,12
…,…
101,1
88,5
36,11
17,57


In [22]:
# How often did we look at a given x coordinate
participant_tsv_pikachu.select(
    pl.col("Gaze point X").cast(pl.Int64).value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="count")

Gaze point X,count
i64,u32
2528,1
2487,1
25,1
2023,1
2089,1
…,…
1339,101
982,102
978,103
1341,104


In [23]:
# how many non-null X-values does Pikachu have?
participant_tsv_pikachu.select(
    pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
).unnest("Gaze point X value counts").sort(by="Gaze point X").filter(
    pl.col("Gaze point X").is_null()
).select(
    non_null=participant_tsv_pikachu.shape[0] - pl.col("count").cast(pl.Int64)
)

non_null
i64
38205


In [None]:
from pathlib import Path

../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-07-Cyndaquil.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-06_Ozempic.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-26_Raichu.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-03-H.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-03-05_Luna.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-21 Charmander.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording11.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 2025-02-20_Hannah_tobii1.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording10.tsv
../../RevChemData/2025-05-14-Data_Export/Tobii-All-Snapshot/1.Realeye1,2,3 Recording12.tsv
../../RevChemData/2025-05-14-Data_Export

column_0
object
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════════╡ │ 55856 ┆ 5355 ┆ 50501 ┆ 2025-03-07-Cyndaquil.tsv │ └───────────┴────────┴──────────┴──────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════════╡ │ 54375 ┆ 4589 ┆ 49786 ┆ 2025-03-06_Ozempic.tsv │ └───────────┴────────┴──────────┴────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬───────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪═══════════════════════╡ │ 54633 ┆ 5527 ┆ 49106 ┆ 2025-02-26_Raichu.tsv │ └───────────┴────────┴──────────┴───────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════╡ │ 57352 ┆ 4540 ┆ 52812 ┆ 2025-03-03-H.tsv │ └───────────┴────────┴──────────┴──────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬─────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪═════════════════════╡ │ 55644 ┆ 5949 ┆ 49695 ┆ 2025-03-05_Luna.tsv │ └───────────┴────────┴──────────┴─────────────────────┘"
…
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════╡ │ 54174 ┆ 4525 ┆ 49649 ┆ 2025-05-07_Eevee.tsv │ └───────────┴────────┴──────────┴──────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬──────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪══════════════════════════╡ │ 57481 ┆ 5892 ┆ 51589 ┆ 2025-05-08_Relicanth.tsv │ └───────────┴────────┴──────────┴──────────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════╡ │ 73002 ┆ 10331 ┆ 62671 ┆ 2025_03_03_Mew.tsv │ └───────────┴────────┴──────────┴────────────────────┘"
"shape: (1, 4) ┌───────────┬────────┬──────────┬────────────────────────┐ │ n_records ┆ n_null ┆ non_null ┆ name │ │ --- ┆ --- ┆ --- ┆ --- │ │ i32 ┆ u32 ┆ u32 ┆ str │ ╞═══════════╪════════╪══════════╪════════════════════════╡ │ 86942 ┆ 48737 ┆ 38205 ┆ 2025-02-19 Pikachu.tsv │ └───────────┴────────┴──────────┴────────────────────────┘"


In [None]:
def review_all_trial_null_counts_X():
    record_nullity_counts = []
    for tsv_file in Path(TOBII_ROOT).iterdir():
        print(tsv_file)
        df = read_tobii_individual_tsv(tsv_file)
        n_null_gaze_x = df.select(
            pl.col("Gaze point X").value_counts().alias("Gaze point X value counts")
        ).unnest("Gaze point X value counts").sort(by="Gaze point X").filter(
            pl.col("Gaze point X").is_null()
        ).select(
            n_records=df.shape[0],
            n_null=pl.col("count"),
            non_null=df.shape[0] - pl.col("count"),
        ).with_columns(
            name=pl.lit(" ".join(tsv_file.name.split(" ")[1:]))
        )
        record_nullity_counts.append(n_null_gaze_x)

    nullity_df = pl.DataFrame(record_nullity_counts)
    display(nullity_df)

review_all_trial_null_counts_X()

#### Data Preparation (for real)

In [25]:
COLUMN_RENAMING_REALEYE_TO_CSV = {
    "timestamp": "timestamp",
    "gaze_point_x": "X",
    "gaze_point_y": "Y"
}
COLUMN_RENAMING_TOBII_TO_CSV = {
    "Recording timestamp": "timestamp",
    "Gaze point X": "X",
    "Gaze point Y": "Y"
}

In [26]:
#| export
from RevChem.realeye import iter_parse_raw_data

In [27]:
# we basically want this for all the coordiante pairs in the string RealEye gives us
pl.DataFrame([
    {"gaze_point_x": sextuple[0], "gaze_point_y": sextuple[1]}
    for sextuple in iter_parse_raw_data(raw_gazes_csv[15]["test_raw_data"].item())
])

gaze_point_x,gaze_point_y
i64,i64
879,388
908,385
899,450
908,419
871,319
…,…
984,433
958,322
984,421
967,336


In [28]:
#| export
from typing import Iterable
def itersize(any_iter: Iterable) -> int:
    count = 0
    for _ in any_iter: count += 1
    return count

In [None]:
#| export
def enhance_realeye_metadata(realeye_df: pl.DataFrame) -> pl.DataFrame:
    "Decorate the metadata with addition information"
    # use pl.Struct to dynamically instruct the strong typing in Rust (which drives Polars)
    re_coord_type = pl.Struct({"gaze_point_x": pl.Int64, "gaze_point_y": pl.Int64})
    return realeye_df.with_columns(
        # unpack into dictionaries that function as tuples. Call them coordinate pairs
        coordinate_pairs=pl.col("test_raw_data").map_elements(
            lambda s: [
                {"gaze_point_x": sextuple[0], "gaze_point_y": sextuple[1]}
                for sextuple in iter_parse_raw_data(s)
            ],
            return_dtype=pl.List(re_coord_type),
        ),
        # count the len of the list of tuples RealEye exported into that one row. Call it n_elements
        n_elements=pl.col("test_raw_data").map_elements(
            lambda s: itersize(iter_parse_raw_data(s)), return_dtype=pl.Int32
        ),  # call that the number of elements
    ).drop("test_raw_data")
    # .sort(by="test_created_at") # temporal sort is not necessary. We've sorted the data already.


def read_realeye_raw_gazes_csv(
    path_to_csv: str | Path, decorated: bool = True
) -> pl.DataFrame:
    "Real the RealEye raw-gazes.csv, decorating with field of RealEyeStruct if `decorated` == True"
    raw_csv = pl.read_csv(
        path_to_csv,
        columns=["participant_id", "item_id", "test_created_at", "test_raw_data"],
        schema_overrides={"test_created_at": pl.Datetime},
    ).sort(by="test_created_at")

    if decorated:
        return enhance_realeye_metadata(raw_csv)
    else:
        return raw_csv

In [30]:
# unpack that raw string for all of the RealEye data
raw_gazes_enhanced = enhance_realeye_metadata(realeye_raw_sorted)

In [31]:
raw_gazes_enhanced.filter(pl.col("test_created_at") > pl.date(2025, 1, 1))

participant_id,item_id,test_created_at,coordinate_pairs,n_elements
str,str,datetime[μs],list[struct[2]],i32
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-19 20:17:11,"[{960,590}, {1041,652}, … {700,726}]",32
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""83e23c46-4fe9-483b-abb5-4ba3cc…",2025-02-19 20:17:11,"[{910,672}, {908,660}, … {924,949}]",158
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""f456cacf-9abd-4825-a964-859f1b…",2025-02-19 20:17:11,"[{718,630}, {753,620}, … {906,646}]",31
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""d4bdad0c-13ae-4276-b1f4-007c2a…",2025-02-19 20:17:11,"[{947,669}, {1015,692}, … {1290,604}]",1867
"""b9682fc8-11b0-4cac-87f9-ab9a98…","""e9978edb-9cdf-4db7-acef-3a08df…",2025-02-19 20:17:11,"[{1079,524}, {1142,527}, … {1187,555}]",32
…,…,…,…,…
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""75ee4147-26ff-48ff-8c88-1743c3…",2025-05-14 20:10:48,"[{949,747}, {875,678}, … {1246,444}]",251
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""63dc6493-df28-4aba-9d79-1dedc3…",2025-05-14 20:10:48,"[{950,749}, {982,690}, … {1287,674}]",1414
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""efff95c6-ae5c-482a-9388-dafb00…",2025-05-14 20:10:48,"[{431,502}, {499,556}, … {1391,454}]",249
"""9f5d6b55-c4cf-42d0-bd36-5f2b54…","""e5762660-01a6-4e77-8e04-066b96…",2025-05-14 20:10:48,"[{1914,505}, {1664,566}, … {850,310}]",1411


In [32]:
#| export
from typing import NamedTuple
from datetime import datetime

class RealEyeStruct(NamedTuple):
    "Truncated struct containing the minimum necessary information to make RealEye exports useful"
    participant_id: str # the participant being tested
    item_id: str # the stimulus being shown
    test_created_at: datetime # when the web browser started running (NOT THE SAME AS THE STIMULUS START TIME)
    coordinate_pairs: list[dict[str, int]] # [{"gaze_point_x": <int>, "gaze_point_y": <int>}]
    n_elements: int # number of coordinate pairs in <self>.coordinate_pairs

    @classmethod
    def from_tuple(cls, tuple_) -> 'RealEyeStruct':
        return cls(*tuple_)

In [33]:
#| export
from datetime import timedelta, datetime, UTC

def unroll_realeye_dataframe_into_record_dataframes(df: pl.DataFrame):
    """Convert each row of a RealEye-exported CSV into a dataframe of timestamped records
    
    We assume 30 Hz data is given, and so concatenate all dataframes in the order they are ingested/converted
    with a rolling 1/30th of a second added to the time of the first record encountered.
    """
    time_inc = timedelta(seconds=1/30) # 30 Hz data
    dfs = []
    for row in map(RealEyeStruct.from_tuple, df.rows()):
        pseudo_start_time = row.test_created_at # .replace(second=0)
        row_df = pl.DataFrame({
            "timestamp": row.test_created_at.replace(tzinfo=UTC), # we assert the RealEye dataframe is UTC timestamped, per the docs
            "X": [pairs["gaze_point_x"] for pairs in row.coordinate_pairs],
            "Y": [pairs["gaze_point_y"] for pairs in row.coordinate_pairs]
        })
        dfs.append((pseudo_start_time,row_df))

    # group by -> concat all the columns -> count the time with all the data in order
    # NOTE: grouping unique to the minute, which should be shared among RealEye, though the second may differ
    # in particular, several entries are exactly 1 second apart.
    # We assume the later of these entries is "next" chronologically.
    grouped_dfs = group_by(lambda tup: tup[0].replace(second=0, microsecond=0), dfs)
    output_dfs = []
    for start_time, group in grouped_dfs.items():
        df_at_start_time = pl.concat([df for _, df in group])
        start_time = df_at_start_time["timestamp"][0]
        end_time = start_time + time_inc * (df_at_start_time.shape[0] - 1)
        df_with_time_corrected = df_at_start_time.with_columns(
            timestamp=pl.datetime_range(start_time, end_time, time_inc)
        )
        output_dfs.append(df_with_time_corrected)
    
    return output_dfs

In [81]:
#| export
# from RevChem.common import list_concat
def list_concat(lists: list[list]) -> list:
    result = [
        item for l_ in lists for item in l_
    ]
    return result

In [91]:
def unroll_realeye_df_counting_backwards(df: pl.DataFrame):
    """Convert RealEye Raw record to the unrolled record, with the start_time interpreted as the end time.

    Video evidence suggests that the start_time field is actually more like the "recording completed at" time
    i.e. the end time.
    """
    time_inc = timedelta(seconds=1 / 30)  # 30 Hz data
    dfs = []
    for row in map(RealEyeStruct.from_tuple, df.rows()):
        pseudo_start_time = row.test_created_at
        row_df = pl.DataFrame(
            {
                # we assert the RealEye dataframe is UTC timestamped, per the docs
                # "timestamp": row.test_created_at.replace(tzinfo=UTC),
                "X": [pairs["gaze_point_x"] for pairs in row.coordinate_pairs],
                "Y": [pairs["gaze_point_y"] for pairs in row.coordinate_pairs],
            }
        )
        dfs.append((pseudo_start_time, row_df))

    # group by -> concat all the columns -> count the time with all the data in order
    # NOTE: grouping unique to the minute, which should be shared among RealEye, though the second may differ
    # in particular, several entries are exactly 1 second apart.
    # We assume the later of these entries is "next" chronologically.
    grouped_dfs = group_by(lambda tup: tup[0], dfs)
    dfs = list_concat(grouped_dfs.values())
    print(f"{type(dfs) = } and {type(dfs[0]) = } and {len(dfs) = }")
    grouped_dfs = group_by(lambda tup: tup[0].replace(second=0, microsecond=0), dfs)

    output_dfs = []
    for end_time, group in grouped_dfs.items():
        concatted_df = pl.concat([df for _, df in group])
        end_time = end_time.replace(tzinfo=UTC) # concatted_df["timestamp"][0]
        # assert end_time == end_time_, f"{end_time} != {end_time_}"
        start_time = end_time - time_inc * (concatted_df.shape[0] - 1)

        # overwrites the timestamp column by making a new "timestamp" column with the same name
        df_with_time_corrected = concatted_df.with_columns(
            timestamp=pl.datetime_range(start_time, end_time, time_inc)
        )
        output_dfs.append(df_with_time_corrected)

    return output_dfs

In [34]:
all_realeye_data_as_exportable = unroll_realeye_dataframe_into_record_dataframes(raw_gazes_enhanced)

In [92]:
all_realeye_data_as_exportable_counted_backward = unroll_realeye_df_counting_backwards(raw_gazes_enhanced)

type(dfs) = <class 'list'> and type(dfs[0]) = <class 'tuple'> and len(dfs) = 350


In [35]:
all_realeye_data_as_exportable

[shape: (7_279, 3)
 ┌────────────────────────────────┬──────┬─────┐
 │ timestamp                      ┆ X    ┆ Y   │
 │ ---                            ┆ ---  ┆ --- │
 │ datetime[μs, UTC]              ┆ i64  ┆ i64 │
 ╞════════════════════════════════╪══════╪═════╡
 │ 2024-10-23 18:37:18 UTC        ┆ 1085 ┆ 721 │
 │ 2024-10-23 18:37:18.033333 UTC ┆ 1027 ┆ 765 │
 │ 2024-10-23 18:37:18.066666 UTC ┆ 986  ┆ 776 │
 │ 2024-10-23 18:37:18.099999 UTC ┆ 1284 ┆ 729 │
 │ 2024-10-23 18:37:18.133332 UTC ┆ 1058 ┆ 842 │
 │ …                              ┆ …    ┆ …   │
 │ 2024-10-23 18:41:20.464242 UTC ┆ 847  ┆ -36 │
 │ 2024-10-23 18:41:20.497575 UTC ┆ 661  ┆ 26  │
 │ 2024-10-23 18:41:20.530908 UTC ┆ 881  ┆ 64  │
 │ 2024-10-23 18:41:20.564241 UTC ┆ 898  ┆ 26  │
 │ 2024-10-23 18:41:20.597574 UTC ┆ 1193 ┆ -35 │
 └────────────────────────────────┴──────┴─────┘,
 shape: (7_110, 3)
 ┌────────────────────────────────┬──────┬─────┐
 │ timestamp                      ┆ X    ┆ Y   │
 │ ---                        

In [93]:
all_realeye_data_as_exportable_counted_backward

[shape: (7_279, 3)
 ┌──────┬─────┬────────────────────────────────┐
 │ X    ┆ Y   ┆ timestamp                      │
 │ ---  ┆ --- ┆ ---                            │
 │ i64  ┆ i64 ┆ datetime[μs, UTC]              │
 ╞══════╪═════╪════════════════════════════════╡
 │ 1085 ┆ 721 ┆ 2024-10-23 18:32:57.402426 UTC │
 │ 1027 ┆ 765 ┆ 2024-10-23 18:32:57.435759 UTC │
 │ 986  ┆ 776 ┆ 2024-10-23 18:32:57.469092 UTC │
 │ 1284 ┆ 729 ┆ 2024-10-23 18:32:57.502425 UTC │
 │ 1058 ┆ 842 ┆ 2024-10-23 18:32:57.535758 UTC │
 │ …    ┆ …   ┆ …                              │
 │ 847  ┆ -36 ┆ 2024-10-23 18:36:59.866668 UTC │
 │ 661  ┆ 26  ┆ 2024-10-23 18:36:59.900001 UTC │
 │ 881  ┆ 64  ┆ 2024-10-23 18:36:59.933334 UTC │
 │ 898  ┆ 26  ┆ 2024-10-23 18:36:59.966667 UTC │
 │ 1193 ┆ -35 ┆ 2024-10-23 18:37:00 UTC        │
 └──────┴─────┴────────────────────────────────┘,
 shape: (7_110, 3)
 ┌──────┬─────┬────────────────────────────────┐
 │ X    ┆ Y   ┆ timestamp                      │
 │ ---  ┆ --- ┆ ---           

In [36]:
#| export
def apply(s, transform): return transform(s)

def clean_tsv_file_name(fname: str) -> str:
    from functools import reduce

    transformations = [
        lambda s: s.split("1.Realeye1,2,3")[1].strip(),
        lambda s: s.rstrip(".tsv")
    ]

    return reduce(apply, transformations, fname)

In [37]:
def test_clean_tsv_file_name():
    from fastcore.test import test_eq
    dummy_data = [  # names we need to be able to deal with
        "1.Realeye1,2,3 2025-03-03-H.tsv",
        "1.Realeye1,2,3 2025-03-03_Mew.tsv",
        "1.Realeye1,2,3 2025-03-05 Blastoise.tsv",
    ]

    output_names = [clean_tsv_file_name(data) for data in dummy_data]

    test_eq(all("Realeye" not in name for name in output_names), True)
    test_eq(all(".tsv" not in name for name in output_names), True)


test_clean_tsv_file_name()

In [38]:
# Tobii data
all_tobii_data_as_exportable = [
    read_tobii_individual_tsv(tsv_file)
    .with_columns(tobii_timestamp_to_datetime(datetime_col="start_dt_utc"))[
        COLUMNS_TOBII
    ]
    .rename(COLUMN_RENAMING_TOBII_TO_CSV)
    .with_columns(source_tsv=pl.lit(clean_tsv_file_name(tsv_file.name)))
    for tsv_file in Path(TOBII_ROOT).iterdir()
]

In [94]:
len(all_realeye_data_as_exportable), len(all_tobii_data_as_exportable), len(all_realeye_data_as_exportable_counted_backward)

(35, 33, 35)

2025-04-22 11:50 note
Temporary conclusion: one of the RealEye records is a dud. Hard to know which one.

March 7th look suspect for being 1/4 the usual length of a record, resulting in a trial that's only a minute long.
- Every other recording is about 5 minute long.

2025-04-22 13:27 update
- After correcting the joining of the dataframes for the RealEye data, there's now an extra Tobii session
   - it looks like it's the Pichu2 run, which is exceedingly short for some reason.
   - That sorts that out.

This is the end.

2025-06-10 11:42 update
- The pairing algorithm has been sound but the code wasn't being called correctly
- In the analysis, it seems that there is a scarcity of Tobii rows
   - Why? Because RealEye sequences run for about 4 minutes within a ~10 min Tobii trial, but the Tobii sequence ends before then - usually seconds after the RealEye run begins
   - Ours is the task of seing if we have some data loss in the pipeline that this notebook has become.

In [243]:
# | export
from datetime import tzinfo, UTC
from typing import TypeVar
from RevChem.common import partition, Predicate


def filter_to_newyear_and_sort_by_timestamp(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    return sorted(
        filter(
            lambda df: (df["timestamp"][0] >= datetime(2025, 1, 1, tzinfo=UTC)),
            dfs,
        ),
        key=lambda df: df["timestamp"][0],
    )


_T = TypeVar("_T")
type _Criterion[_T] = tuple[Predicate[_T], str]


def _filter_by_criteria_loop(
    dfs: list[pl.DataFrame],
    criteria: list[_Criterion],
    verbose: bool = True,
) -> tuple[list[pl.DataFrame], list[list[pl.DataFrame]]]:

    misses_in_order = []
    for criterion_index, (crit, explanation) in enumerate(criteria):
        dfs, missed = partition(crit, dfs)
        if missed:
            print(f'{len(missed)} dfs failed the {criterion_index}-th criterion "{explanation}"')
            if verbose:
                print(
                    f"The following previews show the dfs that missed the criteria: {explanation}"
                )
                with pl.Config(tbl_rows=10):
                    for miss_df in missed:
                        print(miss_df.head(2).with_columns(count=len(miss_df)))
        misses_in_order.append(missed)

    return dfs, misses_in_order


def filter_tobii_dfs_by_new_years_heuristics(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    """
    Remove dataframes that do not meet our heuristic and discovered criteria for a valid Tobii trial.

    Returns:
        All of the DataFrames that meet all filtering heuristics and invariants
    """
    criteria = [
        (
            (lambda df: len(df) >= 51_000),
            "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ",
        ),
        (
            (lambda df: all((name not in df["source_tsv"][0] for name in ["Pichu2"]))),
            "found df in black list should not be test recordings",
        ),
    ]
    # If we ever need to do hard debugging, we can name that _ and inspect. Here, we're just releasing the memory.
    dfs, _ = _filter_by_criteria_loop(dfs, criteria)

    return dfs


def filter_realeye_dfs_by_new_years_heuristics(
    dfs: list[pl.DataFrame],
) -> list[pl.DataFrame]:
    """
    Remove dataframes that do not meet our heuristic and discovered criteria for a valid Tobii trial.

    Returns:
        All of the DataFrames that meet all filtering heuristics and invariants
    """
    criteria = [
        (
            (lambda df: len(df) >= 5_000),
            "len(df) >= 5k, corresponding to >= 2.8 minutes of recording @ 30-HZ",
        ),
    ]
    # If we ever need to do hard debugging, we can name that _ and inspect. Here, we're just releasing the memory.
    dfs, _ = _filter_by_criteria_loop(dfs, criteria)

    return dfs

In [95]:
all_realeye_new_year_in_order = filter_realeye_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_realeye_data_as_exportable)
)
all_tobii_new_year_in_order = filter_tobii_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_tobii_data_as_exportable)
)
all_realeye_new_year_in_order_counted_backward = filter_realeye_dfs_by_new_years_heuristics(
    filter_to_newyear_and_sort_by_timestamp(all_realeye_data_as_exportable_counted_backward)
)

1 criterion "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ"
The following previews show the dfs that missed the criteria: len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ
shape: (2, 5)
┌────────────────────────────────┬──────┬──────┬──────────────────┬───────┐
│ timestamp                      ┆ X    ┆ Y    ┆ source_tsv       ┆ count │
│ ---                            ┆ ---  ┆ ---  ┆ ---              ┆ ---   │
│ datetime[μs, UTC]              ┆ i32  ┆ i32  ┆ str              ┆ i32   │
╞════════════════════════════════╪══════╪══════╪══════════════════╪═══════╡
│ 2025-02-27 23:36:21.203 UTC    ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
│ 2025-02-27 23:36:21.325819 UTC ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
└────────────────────────────────┴──────┴──────┴──────────────────┴───────┘
1 criterion "DataFrames should not be test recordings"
The following previews show the dfs that missed the criteria: DataFrames should not be test recordings
shape: (

## On the Length mismatch problem i.e. time series not being alignable without hacky code and no sense-making
We know this from the analysis alone
* a **Tobii** recording **should be at least 51k data points** -> @120HZ is **7 minutes**. 
* A **RealEye** recording **should be at least 7k data points** -> @30HZ is **3.9 minutes**. With a minute for each stimulus, and about 40 seconds of calibration this makes sense.  That said, I think it's supposed to longer but whatever. We'll have to check the audio recordings to have that for sure.

Why don't we do this manually?
* Because we have to define the RealEye association in code anyway, because it has no meaningful identifier.
* Might as well document the heuristic by which they are associated, *in said code*.

In [96]:
len(all_realeye_new_year_in_order), len(all_tobii_new_year_in_order), len(all_realeye_new_year_in_order_counted_backward)

(28, 26, 28)

In [44]:
list(
    pl.concat(
        [re, tob.rename({"timestamp": "ts_tobii", "X": "X_tobii", "Y": "Y_tobii"})],
        how="horizontal",
    )["ts_tobii", "timestamp", "X_tobii", "Y_tobii", "X", "Y", "source_tsv"]
    for (re, tob) in zip(
        [df.head(1) for df in all_realeye_new_year_in_order],
        [df.head(1) for df in all_tobii_new_year_in_order],
    )
)

[shape: (1, 7)
 ┌───────────────────┬─────────────────────────┬─────────┬─────────┬─────┬─────┬────────────────────┐
 │ ts_tobii          ┆ timestamp               ┆ X_tobii ┆ Y_tobii ┆ X   ┆ Y   ┆ source_tsv         │
 │ ---               ┆ ---                     ┆ ---     ┆ ---     ┆ --- ┆ --- ┆ ---                │
 │ datetime[μs, UTC] ┆ datetime[μs, UTC]       ┆ i32     ┆ i32     ┆ i64 ┆ i64 ┆ str                │
 ╞═══════════════════╪═════════════════════════╪═════════╪═════════╪═════╪═════╪════════════════════╡
 │ 2025-02-19        ┆ 2025-02-19 20:17:11 UTC ┆ null    ┆ null    ┆ 960 ┆ 590 ┆ 2025-02-19 Pikachu │
 │ 20:06:09.486 UTC  ┆                         ┆         ┆         ┆     ┆     ┆                    │
 └───────────────────┴─────────────────────────┴─────────┴─────────┴─────┴─────┴────────────────────┘,
 shape: (1, 7)
 ┌─────────────────────┬─────────────────────┬─────────┬─────────┬──────┬─────┬─────────────────────┐
 │ ts_tobii            ┆ timestamp           ┆ X_to

In-order zip-based pairing is inaccurate and unintelligent.

Algorithm that should solve it simply

0. Prep: dfs -> tuples of starting time and df, to ease the comparison function
    - alt: write a comparator that accesses that property
1. Brutal force
    1. For each tobii session, find the closest RealEye session
    2. For logging purposes, articulate the `pl.Duration` or `datetime.TimeDelta`: `end - start` for RealEye
2. More intelligent
    1. Pop found indices from the available indices to search, so you don't double-select a `DataFrame`
    2. Use the `source_tsv` property as a key into a dictionary, to store the found DataFrame once its found
        - or its index. Doesn't really matter


In [46]:
#| export
from datetime import datetime, timedelta
import random

def generate_random_datetimes(start, count, max_minutes_range):
    """
    # Function to generate random datetime list
    # Parameters: start datetime, number of datetimes, max minutes range
    """
    return [start + timedelta(minutes=random.randint(0, max_minutes_range)) for _ in range(count)]


In [47]:
def test_datetime_algorithms():
    a_times = sorted(generate_random_datetimes(datetime(2025, 2, 1), 10, 1440 * 5))
    b_times = sorted(generate_random_datetimes(datetime(2025, 2, 1), 5, 1440 * 5))
    display(a_times, b_times)

test_datetime_algorithms()

[datetime.datetime(2025, 2, 1, 6, 1),
 datetime.datetime(2025, 2, 1, 17, 33),
 datetime.datetime(2025, 2, 2, 0, 3),
 datetime.datetime(2025, 2, 2, 3, 14),
 datetime.datetime(2025, 2, 2, 12, 38),
 datetime.datetime(2025, 2, 3, 23, 38),
 datetime.datetime(2025, 2, 4, 6, 8),
 datetime.datetime(2025, 2, 4, 11, 3),
 datetime.datetime(2025, 2, 4, 22, 30),
 datetime.datetime(2025, 2, 5, 23, 43)]

[datetime.datetime(2025, 2, 3, 8, 49),
 datetime.datetime(2025, 2, 4, 18, 54),
 datetime.datetime(2025, 2, 5, 11, 20),
 datetime.datetime(2025, 2, 5, 15, 31),
 datetime.datetime(2025, 2, 5, 21, 21)]

In [97]:
#| export
def find_tobii_realeye_df_pairs(
    tobii_dfs: list[pl.DataFrame], realeye_dfs: list[pl.DataFrame],
    *,
    _logging: bool = False
) -> list[tuple[pl.DataFrame, pl.DataFrame]]:
    """
    Algorithm

    0. Prep: dfs -> tuples of starting time and df, to ease the comparison function
        - alt: write a comparator that accesses that property
    1. Brutal force
        1. For each tobii session, find the closest RealEye session
        2. For logging purposes, articulate the `pl.Duration` or `datetime.TimeDelta`: `end - start` for RealEye
    2. More intelligent
        1. Pop found indices from the available indices to search, so you don't double-select a `DataFrame`
        2. Use the `source_tsv` property as a key into a dictionary, to store the found DataFrame once its found
            - or its index. Doesn't really matter
    """
    from datetime import datetime, timedelta
    times_tobii = [df["timestamp"][0] for df in tobii_dfs]
    times_realeye = [df["timestamp"][0] for df in realeye_dfs]
    # tobii is the reference, realeye is under scrutiny
    # 50 sec is the shortest time between Tobii "Start record", skipping validation, and starting RealEye
    _MIN_TIME_DELTA = timedelta(seconds=50)
    found_indices: set[int] = set() # just because I don't want to order it
    pair_indices = []
    for tobii_time in times_tobii:
        current_min_time_diff = timedelta(days=1_000) # more time than is sensible
        found_index = -1
        for i, re_time in enumerate(times_realeye):
            latest_diff = re_time - tobii_time
            if (i not in found_indices) and (_MIN_TIME_DELTA <= latest_diff < current_min_time_diff):
                found_index = i # we've found it. Don't need to log it
                current_min_time_diff = latest_diff
                _logging and print(f"Changed: {current_min_time_diff = }")
        found_indices.add(found_index)
        pair_indices.append(found_index)

    result = [
        (tobii_df.sort("timestamp"), realeye_dfs[re_index].sort("timestamp")) 
        for re_index, tobii_df in zip(pair_indices, tobii_dfs)
    ]
    return result

In [98]:
def test_pairing_algo():
    "Testing whether the algorithm works well. (Hint: it does)"
    # this works
    pairs = find_tobii_realeye_df_pairs(all_tobii_new_year_in_order, all_realeye_new_year_in_order_counted_backward)

    show = list(
        pl.concat(
            [re, tob.rename({"timestamp": "ts_tobii", "X": "X_tobii", "Y": "Y_tobii"})],
            how="horizontal",
        )["ts_tobii", "timestamp", "X_tobii", "Y_tobii", "X", "Y", "source_tsv"].head(1)
        for (tob, re) in pairs
    )
    display(show)

In [99]:
test_pairing_algo()

[shape: (1, 7)
 ┌───────────────────┬─────────────────────┬─────────┬─────────┬─────┬─────┬────────────────────┐
 │ ts_tobii          ┆ timestamp           ┆ X_tobii ┆ Y_tobii ┆ X   ┆ Y   ┆ source_tsv         │
 │ ---               ┆ ---                 ┆ ---     ┆ ---     ┆ --- ┆ --- ┆ ---                │
 │ datetime[μs, UTC] ┆ datetime[μs, UTC]   ┆ i32     ┆ i32     ┆ i64 ┆ i64 ┆ str                │
 ╞═══════════════════╪═════════════════════╪═════════╪═════════╪═════╪═════╪════════════════════╡
 │ 2025-02-19        ┆ 2025-02-19          ┆ null    ┆ null    ┆ 960 ┆ 590 ┆ 2025-02-19 Pikachu │
 │ 20:06:09.486 UTC  ┆ 20:12:56.435769 UTC ┆         ┆         ┆     ┆     ┆                    │
 └───────────────────┴─────────────────────┴─────────┴─────────┴─────┴─────┴────────────────────┘,
 shape: (1, 7)
 ┌─────────────────────┬─────────────────────┬─────────┬─────────┬──────┬─────┬─────────────────────┐
 │ ts_tobii            ┆ timestamp           ┆ X_tobii ┆ Y_tobii ┆ X    ┆ Y   ┆ sou

## ACTUALLY EXPORTING THE DATA

In [51]:
#| export 
from RevChem.common import dt_str_now, date_str_now

In [104]:
#| export
def export_ordered_pairs(
    df_pairs: list[tuple[pl.DataFrame, pl.DataFrame]],
    *,
    output_dir_root: Path,
    output_suffix: Path,
    include_joined_output: bool = False,
    tobii_export_naming: dict[str, str] = {
        "timestamp": "timestamp_tobii",
        "X": "X_tobii",
        "Y": "Y_tobii",
    },
    exported_column_names=[
        "timestamp",
        "X",
        "Y",
    ],
):
    # exported_column_names += list(sorted(tobii_export_naming.values()))
    export_dir = output_dir_root / output_suffix
    for tobii_df, re_df in df_pairs:
        # name the directory after the trial date + subject
        trial_subject_date_and_name = tobii_df["source_tsv"][0]
        try:
            outputdir = export_dir / trial_subject_date_and_name
            outputdir.mkdir(exist_ok=False, parents=True)

            re_df[exported_column_names].write_csv(outputdir / "realeye.csv")
            tobii_df[exported_column_names].write_csv(outputdir / "tobii.csv")
            if include_joined_output:
                pl.concat(
                    [re_df, tobii_df.rename(tobii_export_naming)],
                    how="horizontal",
                ).write_csv(outputdir / "joined.csv")
        except Exception as e:
            print(f"Unexpected exception {e = }, {type(e) = }")
            print(f"{outputdir = }")

In [105]:
EXPORT_ROOT = Path(DATA_ROOT, "..", f"{date_str_now()}-python-outputs").resolve()

In [106]:
def run_data_export():
    pairs = find_tobii_realeye_df_pairs(
        all_tobii_new_year_in_order, all_realeye_new_year_in_order_counted_backward
    )
    suffix = Path(f"{dt_str_now()}-counting-backwards")
    export_ordered_pairs(
        pairs,
        output_dir_root=EXPORT_ROOT,
        output_suffix=suffix,
        include_joined_output=True,
    )

run_data_export()

## A cleaner pairing algorithm
It's written in generic form and mostly pulled from the Internet, but I've sown it together

In [209]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from scipy.sparse import csr_matrix
from datetime import datetime, timedelta
from typing import Callable, Optional, TypeVar


T_Time_Haver = TypeVar("T_Time_Haver")
type TimeFieldGetter = Callable[[T_Time_Haver], datetime | "time"]


def time_based_matching(
    list1: list[T_Time_Haver],
    list2: list[T_Time_Haver],
    time_field_getter: TimeFieldGetter,
    max_time_delta: Optional[timedelta] = None,
    penalty_multiplier: float = 1.0,
) -> list[tuple]:
    """
    Find optimal time-based matching minimizing start_time differences.

    Args:
        list1, list2: Lists of objects with time fields
        time_field: Name of the time attribute
        max_time_delta: Maximum allowed time difference (None = no limit)
        penalty_multiplier: Scale factor for time difference penalties

    Returns:
        List of (item1, item2, cost) tuples for matched pairs
    """

    n1, n2 = len(list1), len(list2)

    # Create cost matrix
    cost_matrix = np.full((n1, n2), np.inf)

    for i, item1 in enumerate(list1):
        time1 = time_field_getter(item1)
        for j, item2 in enumerate(list2):
            time2 = time_field_getter(item2)

            time_diff = abs((time1 - time2).total_seconds())

            # Apply time delta constraint if specified
            if max_time_delta and time_diff > max_time_delta.total_seconds():
                continue  # Leave as inf (no match allowed)

            # Cost is the time difference (scaled)
            cost_matrix[i, j] = time_diff * penalty_multiplier

    # Solve using Hungarian algorithm
    row_indices, col_indices = linear_sum_assignment(cost_matrix)

    # Build results
    results = []
    total_cost = 0

    for i, j in zip(row_indices, col_indices):
        if cost_matrix[i, j] < np.inf:
            cost = cost_matrix[i, j] / penalty_multiplier  # Convert back to seconds
            results.append((list1[i], list2[j], cost))
            total_cost += cost

    return results, total_cost

In [210]:
# Example usage
from dataclasses import dataclass
from datetime import datetime, timedelta
from operator import attrgetter

@dataclass
class Event:
    name: str
    start_time: datetime

# Sample data
events_a = [
    Event("Meeting A", datetime(2025, 6, 10, 9, 0)),
    Event("Meeting B", datetime(2025, 6, 10, 14, 0)),
    Event("Meeting C", datetime(2025, 6, 10, 16, 30))
]

events_b = [
    Event("Room 1", datetime(2025, 6, 10, 9, 15)),    # 15 min diff
    Event("Room 2", datetime(2025, 6, 10, 14, 30)),   # 30 min diff  
    Event("Room X", datetime(2025, 6, 10, 14, 28)),   # 28 min diff  
    Event("Room 3", datetime(2025, 6, 10, 16, 0))     # 30 min diff
]

# Find optimal matching
matches, total_cost = time_based_matching(
    events_a, events_b,
    attrgetter("start_time"),
    max_time_delta=timedelta(hours=1),
    penalty_multiplier=1.0
)

for event_a, event_b, time_diff in matches:
    print(f"{event_a.name} -> {event_b.name}: {time_diff/60:.1f} min difference")


Meeting A -> Room 1: 15.0 min difference
Meeting B -> Room X: 28.0 min difference
Meeting C -> Room 3: 30.0 min difference


## A fully typed approach to parsing and loading RealEye data

In [119]:
raw_gazes_csv.head()

participant_id,item_id,test_created_at,test_raw_data
str,str,datetime[μs],str
"""62e1d6db-e570-4194-b792-1fc4ef…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 20:12:00,"""[[973,475,28,0,1588,1079],[917…"
"""a19074f0-818d-42d2-8d29-290429…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2025-02-26 21:38:52,"""[[902,599,36,0,1484,1077],[938…"
"""5f08f1c2-e9f3-4adb-9fb4-a68b3a…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 18:37:18,"""[[1085,721,36,0,1919,593],[102…"
"""105a3ff2-f0de-4d4b-adaa-feddee…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-23 20:12:28,"""[[1054,543,51,0,897,551],[1052…"
"""7fdc2add-bd15-4a04-b77b-242e5b…","""5b8637a5-c1c7-47cc-a1b4-5abe24…",2024-10-25 20:53:25,"""[[938,648,44,0,1919,400],[982,…"


In [None]:
#| export
from typing import Iterator
from RevChem.realeye import GazeInfo, iter_parse_raw_data

T_source = TypeVar("T_source")
T_out = TypeVar("T_out")


class Resettable(Iterable[T_out]):
    def __init__(
        self, source_data: T_source, iter_gen: Callable[[T_source], Iterable[T_out]]
    ):
        self._source = source_data
        self.iter_gen = iter_gen

    def __iter__(self) -> Iterable[tuple[int, int, int, float, float, float]]:
        return self.iter_gen(self._source)


def iter_parse_raw_to_GazeInfo(raw_data: str) -> Iterator[GazeInfo]:
    for sextuple in iter_parse_raw_data(raw_data):
        if (length_ := len(sextuple)) > 6:
            # indicates that the RealEye system captured a mouse click. Nothing more.
            print(f"Got a {length_}-tuple: {sextuple = }")
        yield GazeInfo(*sextuple[:6])


def resettable_iter_raw(test_raw_data: str):
    return Resettable(test_raw_data, iter_parse_raw_to_GazeInfo)


@dataclass
class RealEyeRawRow:
    participant_id: str  # the participant being tested
    item_id: str  # the stimulus being shown
    test_created_at: datetime  # when the web browser started running (NOT THE SAME AS THE STIMULUS START TIME)
    raw_data: Iterable[GazeInfo] = None

    def __post_init__(self):
        self.raw_data = resettable_iter_raw(self.raw_data)

    @classmethod
    def from_row_tuples(cls, tuple) -> "RealEyeRawRow":
        return cls(*tuple)

In [144]:
def fiddle_realeye_one():
    raw_row: RealEyeRawRow = RealEyeRawRow.from_row_tuples(
        raw_gazes_csv.row(0)
    )
    # first_ = first(list(raw_row.raw_data))
    # print(f"{type(first_) = }")
    # print(first_)
    for sextuple in raw_row.raw_data:
        print(sextuple)
        break

fiddle_realeye_one()

GazeInfo(gaze_point_X=973, gaze_point_Y=475, time_ms_since_start=28, scroll_offset_Y=0, mouse_pos_X=1588, mouse_pos_Y=1079)


In [147]:
?pl.DataFrame

In [161]:
item_ids_in_order_LIST = item_ids_in_order.to_list()

In [None]:
#| export
def cumulative_sum(items: list[int|float]) -> list[int|float]:
    """Calculate the cumulative sum up to and including a given index"""
    csum = 0
    res = [None] * len(items)
    for i, item in enumerate(items):
        csum += item
        res[i] = csum
    return res

In [None]:

def test_cumulative_sum():
    t1 = [0, 1, 2, 3]
    r1 = [0, 1, 3, 6]
    assert (a1:=cumulative_sum(t1)) == r1, f"Cumulative sum was incorrect: {a1} != {r1}"

    t2 = [0, 2, 4, 6, 8]
    r2 = [0, 2, 6, 12, 20]
    assert (a2:=cumulative_sum(t2)) == r2, f"Cumulative sum was incorrect: {a2} != {r2}"

    t3 = [1, 3, 5, 7, 9]
    r3 = [1, 4, 9, 16, 25]
    assert (a3:=cumulative_sum(t3)) == r3, f"Cumulative sum was incorrect: {a3} != {r3}"

test_cumulative_sum()

In [224]:
# | export
def raw_gazes_row_to_df(
    row: RealEyeRawRow, # typed row from the CSV. should have few, if any changes, from the raw CSV file. Used for semantic tidyness
    *,
    time_since_name: str = "time_since_start", # new name given to the column that records the time (ms) since this stimulus was shown
    x_name: str = "X", # new name given to the column that captures the X-coordinate of the GazeInfo gaze
    y_name: str = "Y", # new name given to the column that captures the Y-coordinate of the GazeInfo gaze
) -> pl.DataFrame:
    df = pl.DataFrame(
        [
            (
                row.test_created_at,
                gaze_info.time_ms_since_start,
                gaze_info.gaze_point_X,
                gaze_info.gaze_point_Y,
            )
            for gaze_info in row.raw_data
        ],
        schema={
            "test_created_at": pl.Datetime,
            time_since_name: pl.Int32,
            "X": pl.Int32,
            "Y": pl.Int32,
        },
        orient="row",
    )
    return df.with_columns( # force everything to be UTC because that's what it should be (per docs)
        pl.col("test_created_at").dt.replace_time_zone(time_zone="UTC"),
    )

In [215]:
#| export
def run_realeye_df_group_statistics(dfs: list[pl.DataFrame]):
    grouped = group_by(lambda df: df["test_created_at"][0], dfs)
    group_statistics = pl.DataFrame(
        [[group_key, len(group)] for group_key, group in grouped.items()],
        schema=["test_created_at", "n_rows"],
        orient="row",
    )  # .sort("test_created_at")
    with pl.Config(tbl_rows=50):
        display(group_statistics)
    group_statistics.write_csv(EXPORT_ROOT / f"{dt_str_now()}-row_stats.csv")

In [188]:
#| export
def realeye_timestamp_to_datetime(
    datetime_col: str = "test_created_at", # column with the recording start datetime
    timestamp_col: str = "time_ms_since_start", # the integer column representing the milliseconds since stimulus exposure
    *,
    overwrite: bool = True, # Whether, the `timestamp_col` will have a datetime type in the result, or (if False) a new column is created
    additional_offset_ms: int = 0, # Additional offset to 
) -> pl.DataFrame:
    """Update `timestamp_col` to be an increasing datetime rather than the default (i64 or int).

    Corrects the `timestamp_col` of `df` to be a `pl.Datetime`, to ease legibility and computation.
    Sums the `timestamp_col` with that of the reference `datetime_col`, incrementing the time forward.

    Returns:
        A dataframe with the described change to the `timestamp_col`
    """
    new_name = timestamp_col if overwrite else f"{timestamp_col}__new_dt_time"
    new_column = pl.col(datetime_col) + pl.duration(milliseconds=timestamp_col) + pl.duration(milliseconds=additional_offset_ms)
    new_column = new_column.alias(new_name)

    return new_column

In [232]:
# | export

def correct_realeye_df_group(
    group_dfs: list[pl.DataFrame], *, time_col: str = "time_since_start"
):
    """In-place mutation to correct the dfs' timing, assuming dfs are aggregated by `test_created_at`"""
    # 1. sum the last|largest millisecond offset from each of the dfs
    group_millisecond_offset_maxes = [df[time_col].max() for df in group_dfs]
    total_milliseconds_since_start = sum(group_millisecond_offset_maxes)
    # 1a. assume we have fully contiguous time series
    # 2. compute the start time: "test_created_at" - total relative milliseconds
    # Using min (not max), because the file outputs take time, even though the recording is done,
    # and the trial is started earlier than the one-ish sec it takes to create the output.
    re_recording_end = min(map(lambda df: df["test_created_at"][0], group_dfs))
    re_recording_start = re_recording_end - timedelta(
        microseconds=total_milliseconds_since_start * 1000
    )
    # 3. roll the relative milliseconds forward for each subsequent DataFrame
    # this is like a "scan" or "cummulative sum"
    # We shift everything "left" one, because the first doesn't need anything additional
    # The second df need only add the first, third df only add the two prior, etc.
    addend_group_millisecond_offsets = [0] + cumulative_sum(
        group_millisecond_offset_maxes[:-1]
    )
    # update the dfs
    for group_member_index in range(len(group_dfs)):
        df = group_dfs[group_member_index].with_columns(
            __temp_start_time=re_recording_start
        )
        group_dfs[group_member_index] = df.with_columns(
            realeye_timestamp_to_datetime(
                datetime_col="__temp_start_time",
                timestamp_col=time_col,
                additional_offset_ms=addend_group_millisecond_offsets[
                    group_member_index
                ],
            )
        ).drop("__temp_start_time")

In [None]:
# | export


# TODO: rename realeye data pipeline function
def pipeline_raw_realeye_to_timed_dataframe(
    re_raw_df: pl.DataFrame,  # result of pl.read_csv("raw-gazes.csv").
    *,
    do_group_stats_export: bool = False,  # whether compute early stats, write them to EXPORT_ROOT and exit early. Fails if EXPORT_ROOT is undefined.
    debug: bool = False,  # whether we output the first row of each dataframe, to debug what we're looking at.
    dt_timestamp_col: str = "time_since_start",  # name for the datetime timestamp column in the output dataframes
):
    real_eye_rows: list[RealEyeRawRow] = sorted(
        map(RealEyeRawRow.from_row_tuples, re_raw_df.rows()),
        # sorted by item_id, leveraging that list.index(...) -> ordinal position
        key=lambda re_row: item_ids_in_order_LIST.index(re_row.item_id),
    )
    # rows to dataframe
    dfs = [
        raw_gazes_row_to_df(row, time_since_name=dt_timestamp_col)
        for row in real_eye_rows
    ]

    if do_group_stats_export:
        run_realeye_df_group_statistics(dfs)
        return

    if debug:
        display(pl.concat([df.head(1) for df in dfs]))

    # group by "creation" time, down to the second, to get first ordering
    # then flatten so we have an overall sequence with subsequences which are in order.
    # Sorting is performed to make sure the test_created_at and test_created_at+1sec are in the correct order.
    dfs = list_concat(
        sorted(
            group_by(lambda df: df["test_created_at"][0], dfs).values(),
            key=lambda dfs: dfs[0]["test_created_at"][0],
        )
    )
    # group by the minute, order is retained within the group
    # giving us groups that put all entries of a given trial in the right order
    # even if split by a single second, they are collected in the correct stimulus order
    # and are results are output within a minute of each other.
    grouped = group_by(
        lambda df: df["test_created_at"][0].replace(second=0, microsecond=0), dfs
    )
    # now we can apply the timestamp correction algorithm to the groups
    for group_start_minute, group_dfs in grouped.items():
        correct_realeye_df_group(group_dfs, time_col=dt_timestamp_col)

    # lastly, concatenate all of the groups, now that their time columns are fixed
    mapped = {
        group_start_minute: pl.concat(group_dfs)
        for group_start_minute, group_dfs in grouped.items()
    }

    return mapped


Got a 7-tuple: sextuple = [857, 857, 4249, 0, 944, 580, 1]
Got a 7-tuple: sextuple = [1416, 209, 27024, 0, 1919, 1037, 1]
Got a 7-tuple: sextuple = [545, 1043, 3167, 0, 665, 932, 1]
Got a 7-tuple: sextuple = [237, 267, 40087, 0, 1172, 701, 1]
Got a 7-tuple: sextuple = [804, 475, 40287, 0, 1172, 701, 1]


{datetime.datetime(2024, 10, 23, 18, 37, tzinfo=zoneinfo.ZoneInfo(key='UTC')): shape: (7_279, 4)
 ┌─────────────────────────┬─────────────────────────────┬──────┬─────┐
 │ test_created_at         ┆ time_since_start            ┆ X    ┆ Y   │
 │ ---                     ┆ ---                         ┆ ---  ┆ --- │
 │ datetime[μs, UTC]       ┆ datetime[μs, UTC]           ┆ i32  ┆ i32 │
 ╞═════════════════════════╪═════════════════════════════╪══════╪═════╡
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.015 UTC ┆ 1085 ┆ 721 │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.042 UTC ┆ 1027 ┆ 765 │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.075 UTC ┆ 986  ┆ 776 │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.106 UTC ┆ 1284 ┆ 729 │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.143 UTC ┆ 1058 ┆ 842 │
 │ …                       ┆ …                           ┆ …    ┆ …   │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:37:17.876 UTC ┆ 847  ┆ -36 │
 │ 2024-10-23 18:37:18 UTC ┆ 2024-10-23

In [None]:

pipeline_raw_realeye_to_timed_dataframe(raw_gazes_csv)

# Fiddling with new things

## Redoing the pairing
Cleaner this time

In [241]:
# full Tobii pipeline is just a few lines of code
# TODO
def pipeline_tobii_directory_to_all_dfs(
    directory_of_individual_tobii_sessions: str,
    *,
    columns_subset: list[str] = COLUMNS_TOBII,
    column_renaming: dict[str, str] = {},
) -> list[pl.DataFrame]:
    all_tobii_dfs = [
        read_tobii_individual_tsv(tsv_file)
        .with_columns(tobii_timestamp_to_datetime(datetime_col="start_dt_utc"))[
            columns_subset
        ]
        .rename(column_renaming)
        .with_columns(source_tsv=pl.lit(clean_tsv_file_name(tsv_file.name)))
        for tsv_file in Path(directory_of_individual_tobii_sessions).iterdir()
    ]
    all_tobii_dfs = filter_tobii_dfs_by_new_years_heuristics(
        # filter_to_newyear_and_sort_by_timestamp(all_tobii_data_as_exportable)
        all_tobii_dfs
    )

    return all_tobii_dfs

In [244]:
def test_tobii_pipeline():
    tobii_dfs = pipeline_tobii_directory_to_all_dfs(
        TOBII_ROOT, column_renaming=COLUMN_RENAMING_TOBII_TO_CSV
    )
    print(tobii_dfs[0])

test_tobii_pipeline()

1 dfs failed the 0-th criterion "len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ"
The following previews show the dfs that missed the criteria: len(df) >= 51k, corresponding to >= 7 minutes of recording @ 120-HZ
shape: (2, 5)
┌────────────────────────────────┬──────┬──────┬──────────────────┬───────┐
│ timestamp                      ┆ X    ┆ Y    ┆ source_tsv       ┆ count │
│ ---                            ┆ ---  ┆ ---  ┆ ---              ┆ ---   │
│ datetime[μs, UTC]              ┆ i32  ┆ i32  ┆ str              ┆ i32   │
╞════════════════════════════════╪══════╪══════╪══════════════════╪═══════╡
│ 2025-02-27 23:36:21.203 UTC    ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
│ 2025-02-27 23:36:21.325819 UTC ┆ null ┆ null ┆ 2025_2_27_Pichu2 ┆ 9934  │
└────────────────────────────────┴──────┴──────┴──────────────────┴───────┘
shape: (55_856, 4)
┌────────────────────────────────┬──────┬──────┬──────────────────────┐
│ timestamp                      ┆ X    ┆ Y    ┆ source

In [245]:
EXPORT_ROOT = Path(DATA_ROOT, "..", f"{date_str_now()}-python-outputs").resolve()


# new way
def run_data_export_NEW():
    # Not less code in itself, but the full pipeine is right before your eyes! That's better than call strewn about
    print("RE processing...")
    realeye_dfs = list(
        pipeline_raw_realeye_to_timed_dataframe(
            raw_gazes_csv, dt_timestamp_col="timestamp"
        ).values(),
    )
    print("RE done. Sample:")
    print(realeye_dfs[0].head(2))
    print("Tobii processing...")
    tobii_dfs = pipeline_tobii_directory_to_all_dfs(
        TOBII_ROOT, column_renaming=COLUMN_RENAMING_TOBII_TO_CSV
    )
    print("Tobii done. Sample:")
    print(tobii_dfs[0].head(2))

    print("\n\nPairing")
    pairs = find_tobii_realeye_df_pairs(tobii_dfs, realeye_dfs)

    print("Pairing complete. Exporting")
    export_ordered_pairs(
        pairs,
        output_dir_root=EXPORT_ROOT,
        output_suffix=Path(f"{dt_str_now()}-new-reconciliation"),
        include_joined_output=True,
    )


run_data_export_NEW()

RE processing...
Got a 7-tuple: sextuple = [857, 857, 4249, 0, 944, 580, 1]
Got a 7-tuple: sextuple = [1416, 209, 27024, 0, 1919, 1037, 1]
Got a 7-tuple: sextuple = [545, 1043, 3167, 0, 665, 932, 1]
Got a 7-tuple: sextuple = [237, 267, 40087, 0, 1172, 701, 1]
Got a 7-tuple: sextuple = [804, 475, 40287, 0, 1172, 701, 1]
RE done. Sample:
shape: (2, 4)
┌─────────────────────────┬─────────────────────────────┬──────┬─────┐
│ test_created_at         ┆ timestamp                   ┆ X    ┆ Y   │
│ ---                     ┆ ---                         ┆ ---  ┆ --- │
│ datetime[μs, UTC]       ┆ datetime[μs, UTC]           ┆ i32  ┆ i32 │
╞═════════════════════════╪═════════════════════════════╪══════╪═════╡
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.015 UTC ┆ 1085 ┆ 721 │
│ 2024-10-23 18:37:18 UTC ┆ 2024-10-23 18:33:24.042 UTC ┆ 1027 ┆ 765 │
└─────────────────────────┴─────────────────────────────┴──────┴─────┘
Tobii processing...
1 dfs failed the 0-th criterion "len(df) >= 51k, correspondi

In [229]:
EXPORT_ROOT

PosixPath('/Users/stephen/dev/RevChemData/2025-06-12-python-outputs')

## Triangle-only data

In [345]:
triangular_root = Path("/Users/stephen/dev/RevChemData/", "20250617-Triangle-coords")

In [347]:
def fiddling_triangle():
    """Show that there are no values triangle-related AOI columns"""
    # df = read_tobii_individual_tsv(
    #     Path(DATA_ROOT,  "..", "20250617-Triangle-coords", "1.Realeye1,2,3 2025-02-19 Pikachu.tsv")
    # )
    triangle_files = [
        Path(DATA_ROOT,  "..", "20250617-Triangle-coords", "1.Realeye1,2,3 2025-02-19 Pikachu.tsv")
    ]
    # triangle_files = Path(DATA_ROOT,  "..", "20250617-Triangle-coords",).iterdir()
    for file in (f for f in triangle_files if "test.tsv" not in f.name): # test.tsv has no contents.
        df = read_tobii_individual_tsv(file)
        columns_tri = [col for col in df.columns if ("triangle" in col) or ("fix" in col.casefold())]
        df_assess = df[
            ["Recording timestamp", "Gaze point X", "Gaze point Y"] + columns_tri
        ]
        for col in columns_tri:
            df_filter = df_assess.filter(pl.col(col).is_not_null())
            if df_filter.shape[0]: # if we have at least one record, show us what you have.
                display(f"File '{file}' - column '{col}'")
                display(df_filter)


fiddling_triangle()