# Image generation by segments
The goal of this notebook is to demonstrate a new capability that we've made possible: pixel-by-pixel replay of a RealEye trial.
A stretch goal is to show the Tobii data as well.

**Why aren't they together, when you have code to join them?**
- New code would have to be written to segment the Tobii data in parallel with the RealEye data. That would be a simple "nearest" join followed by filtering out `nulls`, but RealEye has to take the lead, and the Tobii and RealEye data need to be paired already.
    - Format-agnostic pairing has not been done
    - Stapling indices into the existing code would be hacky and error-prone
    - This will result in something that is more clear.

In [None]:
#| default_exp data.timeseries

In [1]:
#|export
import polars as pl
from pathlib import Path

In [None]:
from RevChem.data.export import read_chunks_from_json

# Load the JSON store of the RealEye and Tobii pairs. The format is fast to read: 4 seconds for 35 trials.
# The data was written as JSON to: /Users/stephen/dev/RevChemData/2025-07-17-python-outputs/202507171142-matches-with-TCA.json.gz
# NOTE: this is ALL of the data, not just a single trial. More compact, but slightly less expressive.
# NOTE(review): presumably each entry is a (Tobii frame, list of RealEye frames) pair, since that is the
# shape `join_chunks_as_segments` consumes -- confirm against `read_chunks_from_json`.
associated_tobii_re_sequences = read_chunks_from_json(
    Path("~/dev/RevChemData/2025-07-17-python-outputs/202507171142-matches-with-TCA.json.gz").expanduser(),
)

In [None]:
# | export

from typing import NamedTuple


class AssociatedTrialSegements(NamedTuple):
    """One trial's identifier paired with its joined Tobii/RealEye segments.

    Built by `join_chunks_as_segments`, which stores the first `source_tsv`
    value of the Tobii frame as the identifier and one joined frame per
    RealEye dataframe as the segments.

    NOTE: the class name is spelled "Segements"; it is kept as-is because
    other code refers to it by this name.
    """

    # Identifier used to look the trial up (see `select_trial`).
    trial_name_or_id: str
    # One joined dataframe per RealEye dataframe, in the order they were supplied.
    segments: list[pl.DataFrame]


def join_chunks_as_segments(
    associated_chunks: list[tuple[pl.DataFrame, list[pl.DataFrame]]],
    *,
    join_strategy="backward",
    drop_null=False,
) -> list[AssociatedTrialSegements]:
    """Transform a "chunk and associated list" into a "list of associated chunks".

    Algorithm:
        Given a list of tuple[pl.DataFrame, list[pl.DataFrame]] representing Tobii and RealEye, resp.,
        for each RealEye dataframe:
            - keep only the Tobii rows whose "timestamp" lies within the RealEye frame's
              [min, max] timestamp range (bounds inclusive)
            - as-of join those Tobii rows (left side) with the RealEye frame on "timestamp"
            - the RealEye "X"/"Y" columns are renamed to "X_re"/"Y_re"
            - the RealEye "test_created_at" column is dropped
            - optionally drop every row that contains a null

    Arguments:
        associated_chunks: list of matched Tobii dataframe with all the RealEye dataframes per
            stimulus. The Tobii frame must carry a "source_tsv" column; its first value becomes
            the trial name, so an empty Tobii frame is not supported.
        join_strategy: forwarded to polars' `join_asof` as `strategy`
            (e.g. "backward", "forward", "nearest").
        drop_null: when True, rows with a null in ANY column are removed from each segment.
            NOTE: this can be 10% of the data and shouldn't be done lightly.

    Returns:
        One `AssociatedTrialSegements` per input chunk; its segments are the Tobii sub-frames
        joined against each RealEye frame, in the order the RealEye frames were given.

    NOTE: polars documents that `join_asof` expects both frames to be sorted by the join key;
    this function does not sort them.
    """
    # Loop-invariant: the same rename applies to every RealEye frame.
    re_rename = {"X": "X_re", "Y": "Y_re"}

    output = []
    for tobii_df, re_dfs in associated_chunks:
        trial_name = tobii_df["source_tsv"][0]
        tobii_points = tobii_df.drop("source_tsv")

        segmented_associations = []
        for re_df in re_dfs:
            segment_start = re_df["timestamp"].min()
            segment_end = re_df["timestamp"].max()

            # Restrict the Tobii rows to this segment's time window BEFORE joining. An as-of
            # match for a left row depends only on the right frame, so this yields the same
            # rows as joining the full Tobii frame and filtering afterwards, without paying
            # for a full-trial join on every segment.
            tobii_in_window = tobii_points.filter(
                (pl.col("timestamp") >= segment_start)
                & (pl.col("timestamp") <= segment_end)
            )

            # NOTE: may need to use the `tolerance` kwarg to better tune the match-up
            associated = tobii_in_window.join_asof(
                re_df.drop("test_created_at").rename(re_rename),
                on="timestamp",
                strategy=join_strategy,
            )

            if drop_null:
                # NOTE: important to make sure Tobii's empty samples don't crowd in
                associated = associated.drop_nulls()
            segmented_associations.append(associated)

        output.append(AssociatedTrialSegements(trial_name, segmented_associations))
    return output


def test_chunk_assoc():
    """Smoke test: join only the first Tobii/RealEye chunk and print what came out.

    Reads the notebook-level `associated_tobii_re_sequences` and prints the trial
    name followed by the list of joined segments (up to 30 table rows each).
    """
    only_first_chunk = associated_tobii_re_sequences[:1]
    joined = join_chunks_as_segments(only_first_chunk, join_strategy="backward")

    first_association = joined[0]
    trial_name = first_association[0]
    segmented_associations = first_association[1]

    print(f"For trial {trial_name}")
    # Widen the polars table display just for this print.
    with pl.Config(tbl_rows=30):
        print(segmented_associations)


In [None]:
#| hide
# Run the smoke test; it prints the trial name and the joined segments of the first chunk.
test_chunk_assoc()

For trial 2025-03-07-Cyndaquil
[shape: (117, 5)
┌────────────────────────────┬──────┬──────┬──────┬──────┐
│ timestamp                  ┆ X    ┆ Y    ┆ X_re ┆ Y_re │
│ ---                        ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
│ datetime[μs]               ┆ i32  ┆ i32  ┆ i32  ┆ i32  │
╞════════════════════════════╪══════╪══════╪══════╪══════╡
│ 2025-03-07 18:47:07.934088 ┆ 912  ┆ 684  ┆ 956  ┆ 547  │
│ 2025-03-07 18:47:07.942421 ┆ 914  ┆ 684  ┆ 956  ┆ 547  │
│ 2025-03-07 18:47:07.950754 ┆ 915  ┆ 684  ┆ 956  ┆ 547  │
│ 2025-03-07 18:47:07.959088 ┆ 915  ┆ 692  ┆ 892  ┆ 514  │
│ 2025-03-07 18:47:07.967421 ┆ 911  ┆ 691  ┆ 892  ┆ 514  │
│ 2025-03-07 18:47:07.975754 ┆ 912  ┆ 688  ┆ 892  ┆ 514  │
│ 2025-03-07 18:47:07.984088 ┆ 913  ┆ 684  ┆ 892  ┆ 514  │
│ 2025-03-07 18:47:07.992421 ┆ 914  ┆ 682  ┆ 894  ┆ 513  │
│ 2025-03-07 18:47:08.000754 ┆ 915  ┆ 676  ┆ 894  ┆ 513  │
│ 2025-03-07 18:47:08.009088 ┆ 917  ┆ 679  ┆ 894  ┆ 513  │
│ 2025-03-07 18:47:08.017421 ┆ 918  ┆ 679  ┆ 894  ┆ 513  │
│ 2025-0

In [46]:
# Display the stimulus image paths.
# NOTE(review): in the part of the notebook visible here, `stimuli_paths` is only assigned in a
# later cell, so this cell raises NameError on a fresh "Restart Kernel -> Run All" -- confirm and
# move that assignment above this cell if so.
stimuli_paths

[PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_00_visual_reset.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_01_visual_reset.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_02_Triangle.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_03_visual_reset.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_04_ocean_question.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_05_confidence.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_06_ocean_reasoning.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_07_visual_reset.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_08_molecule_question.jpg'),
 PosixPath('/Users/stephen/dev/RevChem-Stimuli/jpegs/stimulus_09_confidence.jpg')]

## A note on timings of the RealEye-Tobii trial's RealEye portion
Kathy says the triangle gets 5 seconds and the other two stimuli get 30 seconds each.
The control trial says:
* Frame between stimuli is 1 second
* Triangle gets 5 seconds
* Stimulus (questions) get 60 seconds
* Stimulus (confidence) get 8 or 9 seconds
* Stimulus (reasoning) get 45 seconds

I can work with that to get the approximate timing down, and then map the RealEye buckets onto the stimuli.

In [None]:
from itertools import takewhile
from typing import Iterable


def take_n(iterable, *, n=5) -> Iterable:
    """Lazily yield at most the first ``n`` items of ``iterable``.

    Exactly ``n`` items (or fewer, if the source runs out) are pulled from the
    source; when ``iterable`` is a one-shot iterator, the item after the last
    yielded one stays available to the caller.

    Arguments:
        iterable: any iterable or iterator.
        n: maximum number of items to yield; zero or a negative value yields nothing.

    Yields:
        The leading items of ``iterable``, in order.
    """
    # Local import: the surrounding cell only imports `takewhile` from itertools.
    from itertools import islice

    # `islice` stops after `n` items without fetching an (n+1)-th one, which a
    # `takewhile`-with-counter approach would fetch and then throw away.
    yield from islice(iterable, max(n, 0))
    
def test_take_n():
    """Check that `take_n` returns exactly the first `n` items of a longer range."""
    for wanted in (5, 4):
        taken = list(take_n(range(10), n=wanted))
        assert taken == list(range(wanted))

In [25]:
# Run the `take_n` checks; no output means every assertion passed.
test_take_n()

In [76]:
from matplotlib import animation, pyplot as plt
import numpy as np


def plot_point_stream_and_animate(
    stimulus_image_paths: list[str],
    tobii_re_merge: AssociatedTrialSegements,
    *,
    # num_frames=240,
    encode_fps=30,
    outputfilename="animation",
    do_blit=True,
    export_fps=30,
    n_images_to_show: int = None,
):
    """Render a trial as an MP4 with Matplotlib: stimulus image plus one red and one blue dot.

    Stimulus images and segments are paired positionally. Every row of a segment becomes
    one animation frame drawn over that segment's image; the red dot is the RealEye point
    and the blue dot is the Tobii point.

    Arguments:
        stimulus_image_paths: image files, in the same order as the segments.
        tobii_re_merge: the trial to render. Each segment row is unpacked positionally as
            (timestamp, Tobii X, Tobii Y, RealEye X, RealEye Y).
        encode_fps: sets the animation's frame interval (``1000 / encode_fps`` milliseconds).
        outputfilename: output path WITHOUT the ".mp4" suffix; missing parent directories
            are created.
        do_blit: forwarded to ``FuncAnimation`` as ``blit``.
        export_fps: frame rate handed to the ffmpeg writer when saving.
        n_images_to_show: maximum number of (image, segment) pairs to render; ``None`` means
            as many as can be paired.

    Returns:
        None. The video is written to ``f"{outputfilename}.mp4"``.
    """
    if n_images_to_show is None:
        n_images_to_show = max(len(stimulus_image_paths), len(tobii_re_merge.segments))

    IMAGE_WIDTH, IMAGE_HEIGHT = (1920, 1080)

    # Pair images with segments once, up front. The frame generator below is invoked more than
    # once by Matplotlib, and the total frame count is needed for `save_count`.
    image_segment_pairs = list(
        take_n(zip(stimulus_image_paths, tobii_re_merge.segments), n=n_images_to_show)
    )
    total_frames = sum(segment.height for _, segment in image_segment_pairs)

    fig, ax = plt.subplots()
    try:
        # Set axis limits EXACTLY to the image dimensions.
        ax.set_xlim(0, IMAGE_WIDTH)
        ax.set_ylim(IMAGE_HEIGHT, 0)  # Inverted Y-axis for image coordinates

        # Turn off the axis labels, ticks, etc.
        # ax.axis("off")

        image_artist = ax.imshow(np.zeros((IMAGE_HEIGHT, IMAGE_WIDTH, 3)))

        # Create scatter plots for red and blue points
        red_scatter = ax.scatter([], [], c="red", s=100, marker="o")
        blue_scatter = ax.scatter([], [], c="blue", s=100, marker="o")

        # Animation sequencer: rather than creating multiple animation objects, we change the
        # state the animator is tracking.
        def animation_sequenced_images_and_frames():
            for img_path, tobii_re_xy in image_segment_pairs:
                # Load the background image
                image_data = plt.imread(img_path)
                print(f"Loaded an image: {img_path = }")
                # Only the first frame of a segment needs to swap the background image.
                should_render_image_anew = True
                for tobii_re_row_data in tobii_re_xy.iter_rows():
                    yield should_render_image_anew, image_data, tobii_re_row_data
                    if should_render_image_anew:
                        print(f"Rendered {img_path}; set flag to {should_render_image_anew}.")
                    should_render_image_anew = False

        # Update function for the animation
        def update(animation_state):
            should_render_image_anew, image_data, tobii_re_xy_row = animation_state
            _, tobii_X, tobii_Y, re_X, re_Y = tobii_re_xy_row
            # Update the position of the red and blue points for the current frame
            red_scatter.set_offsets([re_X, re_Y])
            blue_scatter.set_offsets([tobii_X, tobii_Y])
            if should_render_image_anew:
                image_artist.set_data(image_data)
            return image_artist, red_scatter, blue_scatter

        # Create the animation
        anim = animation.FuncAnimation(
            fig,
            func=update,
            frames=animation_sequenced_images_and_frames,
            # interval is the millisecond gap between frames: fps = 1000/interval <-> 1000/fps = interval
            interval=1000 / encode_fps,
            blit=do_blit,
            # A generator has no length, so tell Matplotlib how many frames to save; without
            # this, older Matplotlib releases stop saving after 100 frames.
            save_count=total_frames,
            # Frames are produced once and written straight to disk; caching them only costs memory.
            cache_frame_data=False,
        )

        # Save the animation as an MP4 file
        Path(outputfilename).parent.mkdir(exist_ok=True, parents=True)

        print("Writing the MP4...")
        written_to_disk = f"{outputfilename}.mp4"
        anim.save(written_to_disk, writer="ffmpeg", fps=export_fps)
        print(f"Wrote MP4 to disk: {written_to_disk}")
    finally:
        # Close the figure to free memory, even when saving fails.
        plt.close(fig)

In [60]:
# Collect every stimulus JPEG. `sorted` orders the paths lexicographically; the file names shown
# in this notebook carry a zero-padded `stimulus_NN_` prefix, so that is also presentation order.
stimuli_paths = sorted(Path("/Users/stephen/dev/RevChem-Stimuli/jpegs").glob("*.jpg"))

# We know that the stimuli are in the same order as the RealEye data.
# The problem is that we *don't* know which stimulus corresponds to which subset of the RealEye data.

In [61]:
def select_trial(trial_name: str, joined_segments: list[AssociatedTrialSegements]):
    """Return the first entry of ``joined_segments`` named ``trial_name``.

    Arguments:
        trial_name: value to compare against each entry's ``trial_name_or_id``.
        joined_segments: the candidates to search, in priority order.

    Returns:
        The first matching entry.

    Raises:
        ValueError: if no entry has that name.
    """
    matches = [
        candidate
        for candidate in joined_segments
        if candidate.trial_name_or_id == trial_name
    ]
    if matches:
        return matches[0]

    raise ValueError(f"Unable to find trial with name {trial_name}")

In [91]:
# Join every trial's Tobii frame to its RealEye segments using the "nearest" strategy.
# `drop_null=True` removes rows containing a null, so every remaining row has values in all columns.
trial_joined_segments = join_chunks_as_segments(associated_tobii_re_sequences, join_strategy="nearest", drop_null=True)

In [79]:
%load_ext autoreload
%autoreload 2

In [95]:
import cv2

def render_point_stream_video_with_opencv(
    stimulus_image_paths: list[str],
    tobii_re_merge: AssociatedTrialSegements,
    *,
    export_fps=30,
    output_file_name="animation",
    n_images_to_show: int = None,
):
    """Render a trial as an MP4 with OpenCV: stimulus image plus one red and one blue dot.

    Stimulus images and segments are paired positionally. Every row of a segment becomes
    exactly one video frame: a copy of that segment's image (resized to 1920x1080) with a
    filled red circle at the RealEye point and a filled blue circle at the Tobii point.
    Because one row equals one frame, playback is real-time only when `export_fps` matches
    the number of rows per second in the data.

    Arguments:
        stimulus_image_paths: image files (str or path-like), in the same order as the segments.
        tobii_re_merge: the trial to render. Each segment row is unpacked positionally as
            (timestamp, Tobii X, Tobii Y, RealEye X, RealEye Y).
        export_fps: frame rate written into the video file.
        output_file_name: output path WITHOUT the ".mp4" suffix; missing parent directories
            are created.
        n_images_to_show: maximum number of (image, segment) pairs to render; ``None`` means
            one per stimulus image.

    Returns:
        None. The video is written to ``f"{output_file_name}.mp4"``. If the video writer
        cannot be opened, an error is printed and nothing is written.
    """
    if n_images_to_show is None:
        n_images_to_show = len(stimulus_image_paths)

    IMAGE_WIDTH, IMAGE_HEIGHT = 1920, 1080
    TARGET_DIMS = (IMAGE_WIDTH, IMAGE_HEIGHT)
    POINT_RADIUS = 10
    # --- Define colors in BGR format ---
    # For RED, you want the Red channel (index 2) to be 255.
    COLOR_RED = (0, 0, 255)
    # For BLUE, you want the Blue channel (index 0) to be 255.
    COLOR_BLUE = (255, 0, 0)

    output_name_on_disk = f"{output_file_name}.mp4"
    # The writer cannot open a file inside a directory that does not exist yet.
    Path(output_name_on_disk).parent.mkdir(parents=True, exist_ok=True)

    # 1. Set up the VideoWriter object from OpenCV
    # 'mp4v' is a common codec for .mp4 files.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_name_on_disk, fourcc, export_fps, TARGET_DIMS)

    if not video_writer.isOpened():
        video_writer.release()
        print("Error: Could not open video writer.")
        return

    try:
        # Never negative, so the slice below cannot wrap around to the end of the list.
        num_trials = max(
            0,
            min(n_images_to_show, len(stimulus_image_paths), len(tobii_re_merge.segments)),
        )
        paths_to_render = stimulus_image_paths[:num_trials]

        # Pre-load all images into memory to avoid repeated disk I/O
        print("Pre-loading images...")
        # `str(...)` because not every OpenCV build accepts `pathlib.Path` objects.
        images_in_memory = {path: cv2.imread(str(path)) for path in paths_to_render}
        print("Images loaded.")

        for i_trial, (img_path, tobii_re_xy) in enumerate(
            zip(paths_to_render, tobii_re_merge.segments)
        ):
            # Get the pre-loaded image data
            # NOTE: OpenCV loads images in BGR format, not RGB. Be mindful if mixing with matplotlib.
            raw_image = images_in_memory[img_path]

            if raw_image is None:
                # `cv2.imread` returns None instead of raising when a file cannot be read.
                print(f"Warning: Could not load image {img_path}, skipping trial.")
                continue

            base_image = cv2.resize(raw_image, TARGET_DIMS, interpolation=cv2.INTER_AREA)
            print(f"Processing trial {i_trial+1}/{num_trials}...")

            # 2. Loop through the point data for this trial
            for _, tobii_X, tobii_Y, re_X, re_Y in tobii_re_xy.iter_rows():
                # Create a fresh copy of the base image for this frame to draw on.
                frame = base_image.copy()

                # Draw circles directly onto the NumPy array (the image frame).
                # Point coordinates must be integers for cv2; thickness = -1 fills the circle.
                # A point with a missing coordinate is not drawn, but the frame is still
                # written so the video keeps one frame per row.
                if re_X is not None and re_Y is not None:
                    cv2.circle(
                        frame, (int(re_X), int(re_Y)),
                        radius=POINT_RADIUS, color=COLOR_RED, thickness=-1,
                    )
                if tobii_X is not None and tobii_Y is not None:
                    cv2.circle(
                        frame, (int(tobii_X), int(tobii_Y)),
                        radius=POINT_RADIUS, color=COLOR_BLUE, thickness=-1,
                    )

                # 3. Write the modified frame to the video file
                video_writer.write(frame)
    finally:
        # 4. Release the video writer to finalize the file, even if rendering failed.
        video_writer.release()

    # Reported only when rendering completed without an exception.
    print(f"Finished writing video to {output_name_on_disk}")



In [97]:
from time import perf_counter
from RevChem.common import date_str_now, dt_str_now


def run_video_generation(
    trial_name: str,
    *,
    n_stimuli=3,
    output_root="/Users/stephen/dev/RevChemData",
):
    """Render the first `n_stimuli` stimuli of one trial to an MP4 and report how long it took.

    Reads the notebook-level `trial_joined_segments` and `stimuli_paths`.

    Arguments:
        trial_name: name of the trial to look up via `select_trial`.
        n_stimuli: maximum number of stimuli (image + segment pairs) to render.
        output_root: directory under which a dated "<date>-multi-image-video" folder receives
            the video.

    Raises:
        ValueError: if `select_trial` cannot find `trial_name`.
    """
    # prepared trial data that we're going to be turning into a video
    trial_data = select_trial(trial_name, trial_joined_segments)

    export_fps = 60  # we can feel this but not see it precisely. 120 is ideal

    # The renderer stops at the shortest of: requested stimuli, available images, available
    # segments. Mirror that here so the reported duration matches what is rendered and the
    # index below can never run past the end of the segment list (or wrap around for 0).
    n_rendered = max(0, min(n_stimuli, len(stimuli_paths), len(trial_data.segments)))

    # Time span from the first rendered sample to the last rendered sample.
    first_timestamp = last_timestamp = None
    if n_rendered > 0:
        first_timestamp = trial_data.segments[0]["timestamp"].min()
        last_timestamp = trial_data.segments[n_rendered - 1]["timestamp"].max()

    if first_timestamp is None or last_timestamp is None:
        # Nothing to render, or a boundary segment has no rows.
        calculated_duration = 0.0
    else:
        calculated_duration = (last_timestamp - first_timestamp).total_seconds()

    print(f"""Exporting. 
FPS:      {export_fps}-fps (or as close as possible thereto)
Duration: {calculated_duration}-seconds
""")

    ts = dt_str_now()

    start_time = perf_counter()

    # `plot_point_stream_and_animate` accepts the same image paths and trial data and can be
    # swapped in here to render with Matplotlib instead of OpenCV.
    render_point_stream_video_with_opencv(
        stimuli_paths,
        trial_data,
        export_fps=export_fps,
        output_file_name=f"{output_root}/{date_str_now()}-multi-image-video/{ts}-{trial_name}-timing-run",
        n_images_to_show=n_stimuli,
    )
    now = perf_counter()
    print(f"Full run (including image reload): {now - start_time} seconds")


# Render one video per selected trial; the commented-out names are further trials left disabled.
for trial_name in [
    "2025-03-05_Blastoise",
    "2025-03-07-Cyndaquil",
    # "2025-03-11_Ninetail",
    # "2025-03-10-Dakrai"
]:
    # Ask for six stimuli per video instead of the function's default of three.
    run_video_generation(trial_name, n_stimuli=6)

Exporting. 
FPS:      60-fps (or as close as possible thereto)
Duration: 75.973259-seconds

Pre-loading images...
Images loaded.
Processing trial 1/6...
Processing trial 2/6...
Processing trial 3/6...
Processing trial 4/6...
Processing trial 5/6...
Processing trial 6/6...
Finished writing video to /Users/stephen/dev/RevChemData/2025-07-19-multi-image-video/2025-07-19-0400-2025-03-05_Blastoise-timing-run.mp4
Full run (including image reload): 101.71840411797166 seconds
Exporting. 
FPS:      60-fps (or as close as possible thereto)
Duration: 76.00651-seconds

Pre-loading images...
Images loaded.
Processing trial 1/6...
Processing trial 2/6...
Processing trial 3/6...
Processing trial 4/6...
Processing trial 5/6...
Processing trial 6/6...
Finished writing video to /Users/stephen/dev/RevChemData/2025-07-19-multi-image-video/2025-07-19-0402-2025-03-07-Cyndaquil-timing-run.mp4
Full run (including image reload): 99.50677785702283 seconds
