# Applying the Fixation Labeling Decision
In this notebook we calculate a final label for each fixation. The results are stored in one CSV file per session.

In [3]:
import seaborn as sns
import pandas as pd
import seaborn as sns
import json
import matplotlib.pyplot as plt
import ast
import numpy as np

# Let DataFrame displays show up to 1000 rows and 1000 columns before truncating.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 1000)

In [4]:
# Root folder of the data set. The processing loop reads each session's files from
# folder_path + <session name> + "/...", so the trailing "/" is required.
folder_path = "/data/" # path to the folder containing the data folders per session
# NOTE(review): folder_path already ends with "/" and the suffix starts with "/data/", so this
# resolves to "/data//data/label_mapping.json" - confirm the mapping file really lives in a
# nested "data" folder.
mapping_path = folder_path+ "/data/label_mapping.json" # path to the mapping file
# CSV that is iterated row by row when processing; its 'session' column names the session folders.
start_end_path =  folder_path + "start_end_frames.csv"

In [5]:
# Load the label mapping file. It provides three dictionaries:
#   categories       - its values are the raw category names
#   category_mapping - raw category name -> reduced category name
#   level3_mapping   - label -> collection of labels that may be merged into that label
with open(mapping_path, 'r') as json_file:
    data = json.load(json_file)
    categories = data['categories']
    category_mapping = data['category_mapping']
    level3_mapping = data['level3_mapping']
    categories_list = list(categories.values())

# De-duplicate the reduced category names while keeping their first-seen order.
# list(set(...)) would give a different order on every kernel start (string hashing is
# randomized), and this order decides the column order of the output files as well as
# which category idxmax() picks when two categories tie.
categories_reduced = list(dict.fromkeys(category_mapping[c] for c in categories_list))

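LEVEL_1 : this is the only label (at most one category exceeds the 20% share threshold)<br>
LEVEL_2 : several categories are candidates, but more than 65% of the total belongs to one category (threshold as set in the code below)<br>
LEVEL_3 : according to the heuristics (`level3_mapping`) we could find a label<br>
LEVEL_4 : we couldn't find a label -> ambiguous<br>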

### functions


In [24]:
# all_label_with_certainty
# Add new columns for each category, initialized with NaN or some default value

###################in class####################
def in_sublist(value_list, list_to_cat):
    """Find a label whose group in ``list_to_cat`` contains every label of ``value_list``.

    Parameters
    ----------
    value_list : iterable
        Candidate labels (any iterable, e.g. ``dict.keys()``).
    list_to_cat : mapping
        Maps a label to the collection of labels that belong to it.

    Returns
    -------
    The first label of ``value_list`` (in iteration order) whose group contains all
    labels of ``value_list``, or ``False`` when there is none (also for empty input).
    """
    candidates = list(value_list)
    for possible_label in candidates:
        try:
            memberlist = list_to_cat[possible_label]
        except (LookupError, TypeError):
            # No group is defined for this label (or it cannot be used as a key).
            continue
        try:
            if all(x in memberlist for x in candidates):
                return possible_label
        except TypeError:
            # The group is not a container that supports membership tests.
            continue
    return False

def convert_to_list(row):
    """Parse the string form of a Python literal (e.g. ``"[1, 2]"``) into the object.

    Parameters
    ----------
    row : str
        Cell content to parse with ``ast.literal_eval``.

    Returns
    -------
    The parsed literal, or ``np.nan`` when the value cannot be parsed
    (malformed text, or a non-string such as NaN/None).
    """
    try:
        return ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for unparsable input; anything else
        # (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan
    
def heuristic_count(values, categories, threshold):
    """Return the categories whose value exceeds ``threshold`` times the total.

    ``values[i]`` belongs to the i-th entry of ``categories``. The result maps each
    qualifying category to its share of the total, expressed in percent.
    """
    total = sum(values)
    cutoff = threshold * total
    shares = {}
    for position, name in enumerate(categories):
        amount = values[position]
        if amount > cutoff:
            # share of this category in percent
            shares[name] = (amount / total) * 100
    return shares


###################direct####################

def correct_format(df):
    """Parse the list-valued columns and normalise the column names.

    'label', 'overlap_lbl_mask' and (when present) 'certainty' are parsed with
    ``convert_to_list`` and duplicated as 'label_list', 'overlap_list' and
    'certainty_list'. The column 'fixation id' is replaced by 'fixation_id'.
    The parsed columns are written to the passed frame; the returned frame is the
    one without 'fixation id'.
    """
    for source, alias in (('label', 'label_list'), ('overlap_lbl_mask', 'overlap_list')):
        df[source] = df[source].apply(convert_to_list)
        df[alias] = df[source]

    # swap the column with a space in its name for the underscore variant
    df['fixation_id'] = df['fixation id']
    df = df.drop(columns=['fixation id'])

    has_certainty = 'certainty' in df.columns
    if has_certainty:
        df['certainty'] = df['certainty'].apply(convert_to_list)
        df['certainty_list'] = df['certainty']
    return df

def delte_nan_rows(df):
    """Return ``df`` without the rows that have NaN in 'overlap_list' or 'label_list'."""
    for required_column in ('overlap_list', 'label_list'):
        df = df.dropna(subset=[required_column])
    return df

def rename_labels(df,category_mapping):
    """Apply the category mapping to every label of the 'label' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'label' column holds one list of labels per row. The column is
        replaced on this frame.
    category_mapping : dict
        Maps each original label to its replacement.

    Returns
    -------
    pandas.DataFrame
        The same frame with the mapped 'label' column.

    Raises
    ------
    KeyError
        If a label has no entry in ``category_mapping``.
    """
    # One pass over the column instead of iterrows() + per-cell df.at assignment;
    # apply() works by position, so it does not depend on the index being unique.
    df['label'] = df['label'].apply(
        lambda label_list: [category_mapping[label] for label in label_list]
    )
    return df

def write_to_long_format(df,categories):
    """Add one numeric column per category to make the later calculations easier.

    For every row, the entries of 'overlap_lbl_mask' are added up per label of 'label'
    into the column named after that label. When the frame has a 'certainty' column,
    a second column '<category>_p*c' accumulates overlap * certainty in the same way.
    Labels that are not in ``categories`` are ignored. The frame is modified and returned.
    """
    has_certainty = "certainty" in df.columns

    # start every category column (and its certainty-weighted twin) at zero
    for name in categories:
        df[name] = 0.0
        if has_certainty:
            df[name + "_p*c"] = 0.0

    for row_id, record in df.iterrows():
        overlaps = record['overlap_lbl_mask']
        labels = record['label']

        # overlap share per label
        for position, name in enumerate(labels):
            if name in categories:
                df.at[row_id, name] += overlaps[position]

        # overlap share weighted by the certainty of the same entry
        if has_certainty:
            certainties = record['certainty']
            for position, name in enumerate(labels):
                if name in categories:
                    df.at[row_id, name + "_p*c"] += overlaps[position] * certainties[position]
    return df

def replace_space(df):
    """Replace every space in the column names with an underscore and return the frame."""
    underscored = df.columns.str.replace(' ', '_')
    df.columns = underscored
    return df

################## labeling decision ####################

def fix_max_sum_category(df, categories, level3_mapping, share_threshold=0.2, dominant_percent=65):
    """Decide one label per fixation from the summed per-category values.

    For each 'fixation_id' group the category columns are summed and a level is chosen:

    * level 1 - fewer than two categories exceed ``share_threshold`` of the total;
      the category with the largest sum is used.
    * level 2 - one candidate holds more than ``dominant_percent`` percent of the total.
    * level 3 - ``in_sublist`` finds a candidate whose ``level3_mapping`` group
      contains all candidates.
    * level 4 - none of the above; the candidate dictionary is stored as a string.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'fixation_id' column and one column per entry of ``categories``.
    categories : list
        Names of the category columns.
    level3_mapping : dict
        Label -> group of labels, passed to ``in_sublist``.
    share_threshold : float, default 0.2
        Minimum share of the total for a category to count as a candidate.
    dominant_percent : float, default 65
        Percentage above which a single candidate wins at level 2.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns 'fix_annotation_max_sum' and 'level_sum_annotation' set.
    """
    df['fix_annotation_max_sum'] = pd.NA
    df['fix_annotation_max_sum'] = df['fix_annotation_max_sum'].astype('object')

    for fixation_id, group in df.groupby('fixation_id'):
        category_sums = group[categories].sum(axis=0)
        candidates = heuristic_count(category_sums.values, category_sums.index, share_threshold)

        if len(candidates) < 2:
            level, label = 1, category_sums.idxmax()
        elif max(candidates.values()) > dominant_percent:
            level, label = 2, max(candidates, key=candidates.get)
        else:
            merged_label = in_sublist(candidates.keys(), level3_mapping)
            if merged_label is not False:
                level, label = 3, merged_label
            else:
                # ambiguous: keep all candidates with their percentages for inspection
                level, label = 4, str(candidates)

        # build the row mask once per fixation instead of once per assignment
        rows = df['fixation_id'] == fixation_id
        df.loc[rows, 'level_sum_annotation'] = level
        df.loc[rows, 'fix_annotation_max_sum'] = label
    return df

def fix_max_sumXcert_category(df, categories, level3_mapping, share_threshold=0.2, dominant_percent=65):
    """Decide one label per fixation from the summed '<category>_p*c' columns.

    Works like the plain-sum decision but on the certainty-weighted columns:

    * level 1 - fewer than two categories exceed ``share_threshold`` of the total;
      the category with the largest sum is used.
    * level 2 - one candidate holds more than ``dominant_percent`` percent of the total.
    * level 3 - ``in_sublist`` finds a candidate whose ``level3_mapping`` group
      contains all candidates.
    * level 4 - none of the above; the category with the largest sum is stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'fixation_id' column and a '<category>_p*c' column per category.
    categories : list
        Category names without the '_p*c' suffix.
    level3_mapping : dict
        Label -> group of labels, passed to ``in_sublist``.
    share_threshold : float, default 0.2
        Minimum share of the total for a category to count as a candidate.
    dominant_percent : float, default 65
        Percentage above which a single candidate wins at level 2.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns 'fix_annotation_max_pXc' and 'level_pXc_annotation' set.
    """
    suffix = "_p*c"

    def strip_suffix(column_name):
        # turn '<category>_p*c' back into '<category>'
        return column_name[:-len(suffix)]

    df['fix_annotation_max_pXc'] = pd.NA
    df['fix_annotation_max_pXc'] = df['fix_annotation_max_pXc'].astype('object')
    weighted_columns = [name + suffix for name in categories]

    for fixation_id, group in df.groupby('fixation_id'):
        weighted_sums = group[weighted_columns].sum(axis=0)
        candidates = heuristic_count(weighted_sums.values, weighted_sums.index, share_threshold)

        if len(candidates) < 2:
            level, label = 1, strip_suffix(weighted_sums.idxmax())
        elif max(candidates.values()) > dominant_percent:
            level, label = 2, strip_suffix(max(candidates, key=candidates.get))
        else:
            merged_label = in_sublist([strip_suffix(name) for name in candidates.keys()], level3_mapping)
            if merged_label is not False:
                level, label = 3, merged_label
            else:
                # ambiguous: fall back to the category with the largest weighted sum
                level, label = 4, strip_suffix(weighted_sums.idxmax())

        # build the row mask once per fixation instead of once per assignment
        rows = df['fixation_id'] == fixation_id
        df.loc[rows, 'level_pXc_annotation'] = level
        df.loc[rows, 'fix_annotation_max_pXc'] = label
    return df


# we dont use this function anymore
def fix_max_occ_category(df, categories):
    """Label each fixation with the category that is non-zero in most of its rows.

    The result is written to the column 'fix_annotation_max_occ' for all rows of the
    fixation; the modified frame is returned.
    """
    for fixation_id, fixation_rows in df.groupby('fixation_id'):
        # number of rows per category whose value is truthy (non-zero)
        occurrences = fixation_rows[categories].astype(bool).sum(axis=0)

        belongs_to_fixation = df['fixation_id'] == fixation_id
        df.loc[belongs_to_fixation, 'fix_annotation_max_occ'] = occurrences.idxmax()
    return df


def num_of_labels_per_group(group):
    """Store the number of 'label_list' entries (rows) of ``group`` in 'num_of_labels_group'."""
    row_count = len(group['label_list'])
    group['num_of_labels_group'] = row_count
    return group

### Apply the fixation decision to all sessions

In [25]:
# For all sessions: read the per-session "fixation_and_labels.csv", apply the labeling
# decision and write the result to "fixation_and_labels_leveled.csv" in the same folder.

start_end = pd.read_csv(start_end_path)

for index, row in start_end.iterrows():
    session_name = row['session']  # the 'session' column holds the session folder name
    print(f"Processing session {session_name}...")

    fixation_and_labels = folder_path + session_name + "/fixation_and_labels.csv"
    fixation_and_labels_extended = folder_path + session_name + "/fixation_and_labels_leveled.csv"

    label_w_fix = pd.read_csv(fixation_and_labels)
    # Drop the unnamed leading column if the file has one (presumably a saved index);
    # errors='ignore' keeps files without that column from raising a KeyError.
    label_w_fix = label_w_fix.drop(columns=['Unnamed: 0'], errors='ignore')

    # work on a copy so label_w_fix keeps the data exactly as read
    df = label_w_fix.copy()
    df = correct_format(df)
    df = delte_nan_rows(df)
    df = rename_labels(df,category_mapping) # map categories
    df = write_to_long_format(df, categories_reduced)
    df = replace_space(df)
    df = fix_max_sum_category(df, categories_reduced, level3_mapping)
    # fix_max_occ_category is intentionally not applied anymore
    if "certainty" in df.columns: # because some older versions do not have this column
        df = fix_max_sumXcert_category(df, categories_reduced, level3_mapping)

    df.to_csv(fixation_and_labels_extended, index=False)
print("Done!")

Processing session Expl_1_ET_1_2023-09-05_11-56-16_ET...
Processing session Expl_1_ET_2_2023-09-05_12-34-24_ET...
Processing session Expl_1_ET_3_2023-09-05_13-10-01_ET...
Processing session Expl_2_ET_1_2023-09-06_10-36-37_ET...
Processing session Expl_2_ET_2_2023-09-06_11-08-36_ET...
Processing session Expl_2_ET_3_2023-09-06_11-39-21_ET...
Processing session Expl_3_ET_1_2023-09-06_13-24-43_ET...
Processing session Expl_3_ET_2_2023-09-06_13-57-57_ET...
Processing session Expl_3_ET_3_2023-09-06_14-28-39_ET...
Processing session Expl_4_ET_1_2023-09-06_18-31-33_ET...
Processing session Expl_4_ET_2_2023-09-06_18-57-24_ET...
Processing session Expl_5_ET_1_2023-09-07_18-17-19_ET...
Processing session Expl_5_ET_2_2023-09-07_18-48-26_ET...
Done!
