## Create combined files containing the data from all sessions
- df: one file with a row for every gaze data point
- df_fix: one file with a row per fixation

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append("/src/")
from visualization.visualize import (
    print_image_with_point,
    show_complete_fixation_with_all_frames_all_gaze,
)

pd.set_option("display.max_rows", 1000)

In [11]:
# Root folder of the recordings and the two combined output files.
folder_path = "path/to/data/"
fixation_and_labels_total = f"{folder_path}data/fixation_and_labels_total.csv"
fixation_label_total = f"{folder_path}data/fix_only_label_total.csv"

# One entry per recorded session; each name is also the session's sub-folder.
session_names = [
    "Expl_1_ET_1_2023-09-05_11-56-16_ET",
    "Expl_1_ET_2_2023-09-05_12-34-24_ET",
    "Expl_1_ET_3_2023-09-05_13-10-01_ET",
    "Expl_2_ET_1_2023-09-06_10-36-37_ET",
    "Expl_2_ET_2_2023-09-06_11-08-36_ET",
    "Expl_2_ET_3_2023-09-06_11-39-21_ET",
    "Expl_3_ET_1_2023-09-06_13-24-43_ET",
    "Expl_3_ET_2_2023-09-06_13-57-57_ET",
    "Expl_3_ET_3_2023-09-06_14-28-39_ET",
    "Expl_4_ET_1_2023-09-06_18-31-33_ET",
    "Expl_4_ET_2_2023-09-06_18-57-24_ET",
    "Expl_5_ET_1_2023-09-07_18-17-19_ET",
    "Expl_5_ET_2_2023-09-07_18-48-26_ET",
]

# Per-session input files, in the same order as session_names.
fixation_and_labels = [
    f"{folder_path}{name}/fixation_and_labels_leveled.csv" for name in session_names
]
fix_path = [f"{folder_path}{name}/fixations.csv" for name in session_names]

In [3]:
# Build (or reload) one dataframe that combines the gaze samples of all sessions.
start_path = folder_path + "Cyprus_start_end_frames.csv"
start_end_df = pd.read_csv(start_path)

# Set to True to regenerate the combined file from the per-session CSVs.
# This OVERWRITES the existing file, so it is off by default.
force_rebuild = False
# Largest accepted difference (in frames) between the first/last frame of a
# session file and the reference start/end frame.
frame_tolerance = 10

df = None
if not force_rebuild:
    try:
        # Reuse the combined file written by an earlier run so that `df` is
        # defined for the following cells even after a kernel restart.
        df = pd.read_csv(fixation_and_labels_total)
    except FileNotFoundError:
        print("Dataframe does not exist, creating it..")

if df is None:
    dfs = []

    # Loop through file paths, loading each
    for i, file_path in enumerate(fixation_and_labels):
        session_df = pd.read_csv(file_path)
        first_frame = session_df.iloc[0]["frame_nr"]
        last_frame = session_df.iloc[-1]["frame_nr"]
        # NOTE(review): the reference row is picked by position, which assumes
        # start_end_df lists the sessions in the same order as session_names.
        expected = start_end_df.iloc[i]

        # check that the first and last frame are close to the reference frames
        frames_off = (
            abs(first_frame - expected["start"]) > frame_tolerance
            or abs(last_frame - expected["end"]) > frame_tolerance
        )
        status = "Error in session: " if frames_off else "Correct session: "
        print(status, i, " start: ", first_frame, " end: ", last_frame, " should be: ", expected["start"], expected["end"])
        dfs.append(session_df)

    # Concatenate all dataframes in the list
    df = pd.concat(dfs, ignore_index=True)

    # add a column with the total duration of each session, i.e. the difference
    # between its largest and smallest timestamp (same unit as the timestamps)
    session_timestamps = df.groupby("session")["timestamp_[ns]"]
    df["session_duration"] = (
        session_timestamps.transform("max") - session_timestamps.transform("min")
    ).astype("float64")  # float, like the column the per-session .loc fill produced

    # rename columns to their final names
    df = df.rename(
        columns={
            "fix_annotation_max_sum": "fixation_sum_label",
            "fix_annotation_max_pXc": "fixation_pXc_label",
            "fixation_x_[px]": "fixation_x",
            "fixation_y_[px]": "fixation_y",
            "start_timestamp_[ns]": "start_timestamp_ns",
            "end_timestamp_[ns]": "end_timestamp_ns",
            "duration_[ms]": "duration_ms",
        }
    )

    # only keep relevant columns and rearrange them
    df = df[
        [
            "section_id", "recording_id", "timestamp_[ns]", "frame_nr", "session",
            "x", "y",
            "start_timestamp_ns", "end_timestamp_ns", "duration_ms",
            "fixation_x", "fixation_y", "azimuth_[deg]",
            "fixation_id",
            "fixation_sum_label",
            "level_sum_annotation", "fixation_pXc_label",
            "level_pXc_annotation", "session_duration",
        ]
    ]

    # save the complete dataframe
    df.to_csv(fixation_and_labels_total, index=False)

Dataframe does not exist, creating it..
Correct session:  0  start:  13676  end:  32919  should be:  13670 32920
Correct session:  1  start:  9880  end:  28804  should be:  9880 28805
Correct session:  2  start:  8060  end:  29530  should be:  8060 29530
Correct session:  3  start:  11605  end:  30679  should be:  11600 30680
Correct session:  4  start:  8624  end:  27959  should be:  8620 27960
Correct session:  5  start:  12164  end:  31449  should be:  12160 31450
Correct session:  6  start:  8890  end:  28169  should be:  8890 28170
Correct session:  7  start:  8710  end:  27719  should be:  8710 27720
Correct session:  8  start:  7647  end:  26949  should be:  7640 26950
Correct session:  9  start:  8600  end:  27819  should be:  8600 27820
Correct session:  10  start:  7145  end:  26329  should be:  7145 26330
Correct session:  11  start:  10180  end:  29249  should be:  10180 29250
Correct session:  12  start:  8105  end:  27699  should be:  8105 27700


### create df_fix
This dataframe only contains one row per fixation

In [5]:
# Annotation level 4 marks a label as ambiguous: overwrite the corresponding
# label in `df` for both annotation schemes (sum and pXc).
for level_column, label_column in (
    ("level_sum_annotation", "fixation_sum_label"),
    ("level_pXc_annotation", "fixation_pXc_label"),
):
    is_level_four = df[level_column] == 4
    df.loc[is_level_four, label_column] = "ambiguous"

# the subset of columns that is carried over into df_fix
fixation_columns = [
    "session",
    "session_duration",
    "fixation_id",
    "start_timestamp_ns",
    "end_timestamp_ns",
    "duration_ms",
    "fixation_x",
    "fixation_y",
    "fixation_sum_label",
    "fixation_pXc_label",
    "level_sum_annotation",
    "level_pXc_annotation",
]

# drop duplicates such that per fixation only one row remains
df_fix = df[fixation_columns].drop_duplicates().copy()
len(df_fix)


24365

In [6]:
# sanity check: rows where fixation_sum_label is more than one word
# (presumably such labels contain a "{" -- verify against the labelling step).
# The result is expected to be empty.
# regex=False matches the brace literally instead of as a regex pattern;
# na=False treats missing labels as "no match" so the mask stays boolean.
has_brace = df_fix["fixation_sum_label"].str.contains("{", regex=False, na=False)
df_fix.loc[has_brace]

Unnamed: 0,session,session_duration,fixation_id,start_timestamp_ns,end_timestamp_ns,duration_ms,fixation_x,fixation_y,fixation_sum_label,fixation_pXc_label,level_sum_annotation,level_pXc_annotation


In [12]:
# calculate outliers
def mad_outlier(data, threshold=3.5):
    """Flag unusually large values using the median absolute deviation (MAD).

    For every element the modified z-score ``0.6745 * (x - median) / MAD`` is
    computed and compared against ``threshold``. The test is one-sided: only
    values far above the median are flagged, values far below it are not.

    Args:
        data: numeric numpy array or pandas Series. NaN entries are ignored
            when the median and the MAD are estimated and are never flagged.
        threshold: modified z-score above which a value counts as an outlier.

    Returns:
        Boolean mask with the same shape as ``data`` (a Series with the same
        index if ``data`` is a Series); True where the modified z-score is
        greater than ``threshold``.
    """
    # NaN-aware statistics: a single missing value would otherwise turn the
    # median into NaN and silently mark every element as "not an outlier".
    median = np.nanmedian(data)
    deviation = data - median
    abs_deviation = np.abs(deviation)

    mad = np.nanmedian(abs_deviation)
    if mad > 0:
        modified_z_score = 0.6745 * deviation / mad
        return modified_z_score > threshold

    # MAD is 0 when more than half of the values equal the median; dividing by
    # it would give inf/NaN scores. Fall back to the mean absolute deviation,
    # the commonly used substitute for the modified z-score in this case.
    mean_ad = np.nanmean(abs_deviation)
    if mean_ad > 0:
        modified_z_score = deviation / (1.253314 * mean_ad)
        return modified_z_score > threshold

    # No spread at all (or no valid data): nothing can be an outlier.
    # The comparison can never be true and keeps the type/index of the input.
    return abs_deviation < 0

# flag every row of df_fix whose duration is an outlier according to mad_outlier
df_fix["is_outlier"] = mad_outlier(df_fix["duration_ms"])

# write the per-fixation dataframe to disk
df_fix.to_csv(fixation_label_total, index=False)