# Merging the DataFrames
This notebook merges the different data files for each session.

### functions

In [None]:
import pandas as pd
import os

"""
This script contains helper functions for loading and matching data.
It matches the csv with the labels (results from SAM_SEGformer) to the fixation data(Pupil Labs Algorithm).

"""


def match_worldtime_label(dfw, dfl):
    """
    Inner-join the world-timestamp table and the label table on 'frame_nr'.

    Parameters:
    dfw (pd.DataFrame): World-timestamp DataFrame with a 'frame_nr' column.
    dfl (pd.DataFrame): Label DataFrame with a 'frame_nr' column.

    Returns:
    tuple: Two DataFrames holding the same inner join, the first with dfw as
    the left table (its columns come first) and the second with dfl as the
    left table.
    """
    join_args = {"on": "frame_nr", "how": "inner"}
    world_first = pd.merge(dfw, dfl, **join_args)
    labels_first = pd.merge(dfl, dfw, **join_args)
    return world_first, labels_first


def match_fixation(df, dff):
    """
    Attach to each row of df the fixation whose time interval contains it.

    A row of df matches a fixation when its 'timestamp [ns]' lies within
    ['start timestamp [ns]', 'end timestamp [ns]'] (both bounds inclusive).
    If several fixations match, the first one in dff is used and a warning
    is printed.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'timestamp [ns]' column.
    dff (pd.DataFrame): Fixation DataFrame with 'start timestamp [ns]',
        'end timestamp [ns]', 'section id' and 'recording id' columns.
        It is not modified.

    Returns:
    tuple: (result_df, non_fixation_row). result_df is a DataFrame with one
    row per matched row of df, holding the columns of df followed by the
    fixation columns, where 'section id' / 'recording id' of the fixation are
    stored as 'section id fixation' / 'recording id fixation'.
    non_fixation_row is a list of the rows of df (as Series) that fall inside
    no fixation.
    """
    # Work on a copy so the caller's fixation table does not gain extra columns.
    dff = dff.copy()
    # The id columns are renamed so they cannot collide with same-named columns
    # of df; appending then dropping keeps them at the end of the column order.
    dff["section id fixation"] = dff["section id"]
    dff["recording id fixation"] = dff["recording id"]
    dff = dff.drop(columns=["section id", "recording id"])

    # Interval bounds do not change inside the loop, so extract them once.
    starts = dff["start timestamp [ns]"].to_numpy()
    ends = dff["end timestamp [ns]"].to_numpy()

    result_rows = []
    non_fixation_row = []

    for _, row in df.iterrows():
        time = row["timestamp [ns]"]
        # Positions of all fixations whose interval contains this timestamp.
        hits = ((starts <= time) & (ends >= time)).nonzero()[0]

        if hits.size == 0:
            non_fixation_row.append(row)
            continue

        if hits.size > 1:
            print(f"Warning: Multiple matching rows found for time {time}.")
        result_rows.append(pd.concat([row, dff.iloc[hits[0]]]))

    if result_rows:
        result_df = pd.concat(result_rows, axis=1).T
    else:
        # Keep the expected column layout even when nothing matched.
        result_df = pd.DataFrame(columns=df.columns.tolist() + dff.columns.tolist())

    return result_df, non_fixation_row


def combine_sort_csv_files(folder_path, output_file=None):
    """
    Combine all CSV files of a folder into one DataFrame sorted by 'frame_nr'.

    Parameters:
    folder_path (str): Folder whose '*.csv' files are read and concatenated.
    output_file (str, optional): If given, the combined DataFrame is also
        written to this path (without the index).

    Returns:
    pd.DataFrame: The concatenated rows, sorted by 'frame_nr', with exact
    duplicate rows removed, a fresh 0..n-1 index and without an
    'Unnamed: 0' column.

    Raises:
    ValueError: If the folder contains no CSV file.
    """
    # Sorting the names makes the result independent of the order in which
    # the file system lists the directory.
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )
    if not csv_names:
        raise ValueError(f"No CSV files found in {folder_path!r}.")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]
    label_df = pd.concat(frames, ignore_index=True)

    # A stable sort keeps rows with equal 'frame_nr' in concatenation order.
    # Duplicates are removed before the index is rebuilt so it has no gaps.
    # 'Unnamed: 0' is dropped last, so it still takes part in duplicate
    # detection; errors="ignore" covers files that do not contain it.
    sorted_df = (
        label_df.sort_values(by="frame_nr", kind="stable")
        .drop_duplicates()
        .reset_index(drop=True)
        .drop(columns=["Unnamed: 0"], errors="ignore")
    )

    if output_file is not None:
        sorted_df.to_csv(output_file, index=False)

    return sorted_df


def load_merge_csv(session_name, folder_path):
    """
    Read the three CSV files of one session and merge them.

    Parameters:
    session_name (str): Name of the session; it is appended directly to
        folder_path to form the session directory.
    folder_path (str): Path prefix of the data folder. No separator is
        inserted between folder_path and session_name.

    Returns:
    tuple: (final_matched_df, non_fix_row, world_timestamps_df, fixations_df,
    label_df), i.e. the two results of match_fixation followed by the three
    loaded DataFrames (world_timestamps_df includes the added 'frame_nr'
    column).
    """
    session_dir = folder_path + session_name

    fixations_df = pd.read_csv(session_dir + "/fixations.csv")
    world_timestamps_df = pd.read_csv(session_dir + "/world_timestamps.csv")
    label_df = pd.read_csv(session_dir + "/all_labels_newseg.csv")

    # The row position in world_timestamps.csv is used as the frame number.
    world_timestamps_df["frame_nr"] = world_timestamps_df.index

    # Merge the frames: first timestamps with labels, then with fixations.
    # Only the world-first join is needed; the second result is discarded.
    world_with_labels, _ = match_worldtime_label(world_timestamps_df, label_df)
    final_matched_df, non_fix_row = match_fixation(world_with_labels, fixations_df)

    return final_matched_df, non_fix_row, world_timestamps_df, fixations_df, label_df


def cut_df_to_session(df, start_frame, end_frame):
    """
    Restrict a DataFrame to the frames of the session.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'frame_nr' column.
    start_frame (int): First frame of the session (inclusive).
    end_frame (int): Last frame of the session (inclusive).

    Returns:
    pd.DataFrame: The rows of df whose 'frame_nr' lies between start_frame
    and end_frame, with their original index labels.

    """
    in_session = df["frame_nr"].between(start_frame, end_frame)
    return df[in_session]


def check_matching(
    label_df,
    final_matched_df,
    final_matched_insession_df,
    non_fix_row,
    fixations_df,
    start_frame,
    end_frame,
):
    """
    Print sanity-check diagnostics for the matching result of one session.

    Nothing is returned or raised for a failed check; every finding is
    reported with print().

    Parameters:
    label_df (pd.DataFrame): The loaded label table.
    final_matched_df (pd.DataFrame): All rows that were matched to a fixation.
    final_matched_insession_df (pd.DataFrame): final_matched_df restricted to
        the session frames. Needs the columns 'section id',
        'section id fixation', 'fixation id', 'timestamp [ns]' and 'frame_nr'.
    non_fix_row (list): Rows that were matched to no fixation.
    fixations_df (pd.DataFrame): The fixation table with 'fixation id',
        'start timestamp [ns]' and 'end timestamp [ns]' columns.
    start_frame (int): First frame of the session.
    end_frame (int): Last frame of the session.
    """
    if len(label_df) > len(final_matched_df) + len(non_fix_row):
        print("     Error: Some rows were lost during matching.")
    if len(label_df) + len(non_fix_row) < len(final_matched_df):
        print("     Error: Some rows were duplicated during matching.")

    # All remaining checks read the first/last in-session row, which does not
    # exist for an empty frame.
    if final_matched_insession_df.empty:
        print("     Error: No matched rows lie inside the session frame range.")
        return

    # Positional access, so the checks do not depend on the index having been
    # reset by the caller.
    first_row = final_matched_insession_df.iloc[0]
    last_row = final_matched_insession_df.iloc[-1]

    if first_row["section id"] != first_row["section id fixation"]:
        print("     Error: The section ids do not match.")

    # Heuristic lower bound printed below: 30 fps and three fixations per second.
    expected_min_fixations = ((end_frame - start_frame) / 30) * 3
    if final_matched_insession_df["fixation id"].nunique() < expected_min_fixations:
        # Count the fixations that overlap the time span of the session rows.
        overlaps_session = (
            fixations_df["end timestamp [ns]"] >= first_row["timestamp [ns]"]
        ) & (fixations_df["start timestamp [ns]"] <= last_row["timestamp [ns]"])
        fix_in_session = fixations_df[overlaps_session]["fixation id"].nunique()
        print(f"    There are {fix_in_session} fixation events")
        print(
            f"    With {(end_frame - start_frame)} frames, 30 fps and three fixations/sec on average, there should be at least {expected_min_fixations} fixations."
        )

    # Report when the matched data starts or ends more than 10 frames away
    # from the requested session boundaries.
    if (
        abs(first_row["frame_nr"] - start_frame) > 10
        or abs(last_row["frame_nr"] - end_frame) > 10
    ):
        print(
            "     start: ",
            first_row["frame_nr"],
            " end: ",
            last_row["frame_nr"],
            " should be: ",
            start_frame,
            end_frame,
        )

    print(
        "    Finished checking matching(checked for duplicated or lost rows, wrong section ids, fixation ammount, start/endframe)."
    )

### For all sessions, create the fixation_and_labels files (all gaze data points with their labels and fixations)

In [None]:
# For every session listed in the start/end table: merge its CSV files, cut
# the result to the session frames, run the sanity checks and store the
# result as fixation_and_labels.csv in the session folder.
start_end_path = (
    "/start_end_frames.csv"  # add the path to the file with the start and end frames
)
folder_path = "/data/"  # path to the folder with the data
start_end = pd.read_csv(start_end_path)
for _, row in start_end.iterrows():
    session_name = row["session"]
    start_frame = row["start"]
    end_frame = row["end"]

    fixation_and_labels = (
        folder_path + session_name + "/fixation_and_labels.csv"
    )  # might need to be changed according to the folder structure

    print(f"Processing session {session_name}...")

    final_matched_df, non_fix_row, world_timestamps_df, fixations_df, label_df = (
        load_merge_csv(session_name=session_name, folder_path=folder_path)
    )
    final_matched_insession_df = cut_df_to_session(
        final_matched_df, start_frame, end_frame
    ).reset_index(drop=True)

    # Check if everything is correct.
    # Rows removed by the cut = all matched rows minus the in-session rows.
    rows_outside_session = len(final_matched_df) - len(final_matched_insession_df)
    print(
        f"    Deleted {rows_outside_session} rows which are not in experiment session."
    )
    check_matching(
        label_df,
        final_matched_df,
        final_matched_insession_df,
        non_fix_row,
        fixations_df,
        start_frame,
        end_frame,
    )

    # Final correction of the df: remove the helper id columns and a stray
    # index column. errors="ignore" skips columns that are not present.
    final_matched_insession_df = final_matched_insession_df.drop(
        columns=["section id fixation", "recording id fixation", "Unnamed: 0"],
        errors="ignore",
    )

    # store the final df
    final_matched_insession_df.to_csv(fixation_and_labels)
    print(len(final_matched_insession_df), len(label_df))