In [7]:
import pandas as pd
import numpy as np


### Data Loading

In [19]:
df = pd.read_csv('merged_feature_dataset_PRCD-48C18D.csv')
df.isnull().sum()
df

Unnamed: 0,Participant Id,Condition,Timestamp,Name,Event Details
0,PRCD-48C18D,T,2025-03-04 10:37:00,Session,Word Unscramble Task
1,PRCD-48C18D,T,2025-03-04 10:37:00,Session,
2,PRCD-48C18D,T,2025-03-04 10:37:00,Session,2025-03-04 10
3,PRCD-48C18D,T,2025-03-04 10:37:00,Mouse,
4,PRCD-48C18D,T,2025-03-04 10:37:00,Mouse,
...,...,...,...,...,...
775,PRCD-48C18D,T,2025-03-04 10:37:10,Mouse,"Delta (0, 0)"
776,PRCD-48C18D,T,2025-03-04 10:37:10,Mouse,"Delta (0, -1)"
777,PRCD-48C18D,T,2025-03-04 10:37:10,Mouse,"Delta (0, -1)"
778,PRCD-48C18D,T,2025-03-04 10:37:10,Session,


### Time Stamp Conversion
1. Converted the gaze data in % per minute to match the existing model

In [25]:
import pandas as pd
import numpy as np
import ast
from math import sqrt

# Function to safely parse coordinates
def parse_coordinates(value):
    try:
        # Check if the value is a string and not 'nan' or empty
        if isinstance(value, str) and value.strip().lower() not in ['nan', 'null', '']:
            return ast.literal_eval(value.strip())
        else:
            return np.nan
    except (ValueError, SyntaxError):
        # Handle cases where the string cannot be parsed
        return np.nan

# Function to calculate Euclidean distance
def calculate_distance(x1, y1, x2, y2):
    return sqrt((x2 - x1)**2 + (y2 - y1)**2)

# Load the CSV data

# Clean the Timestamp column
df = df[df['Timestamp'] != ""]
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df = df.dropna(subset=['Timestamp', 'Event Details'])

# Round timestamps to the nearest minute
df['Timestamp'] = df['Timestamp'].dt.floor('T')  # 'T' = minute level

# Create a DataFrame with all unique timestamps, conditions, and participant IDs
all_timestamps = df[['Timestamp', 'Condition', 'Participant Id']].drop_duplicates()

# Process gaze data
gaze_data = df[df['Name'] == 'Gaze'][['Timestamp', 'Event Details', 'Condition', 'Participant Id']].copy()
gaze_data['Event Details'] = gaze_data['Event Details'].str.strip()
gaze_data = gaze_data.dropna(subset=['Event Details']).copy()

# Calculate gaze frequencies
gaze_frequency = gaze_data.groupby(['Timestamp', 'Condition', 'Participant Id']).agg(
    gaze_left_percent=('Event Details', lambda x: (x == 'Left').sum() / len(x) * 100 if len(x) > 0 else 0),
    gaze_right_percent=('Event Details', lambda x: (x == 'Right').sum() / len(x) * 100 if len(x) > 0 else 0),
    gaze_forward_percent=('Event Details', lambda x: (x == 'Center').sum() / len(x) * 100 if len(x) > 0 else 0),
).reset_index()

# Process mouse data
mouse_data = df[df['Name'] == 'Mouse'][['Timestamp', 'Event Details', 'Condition', 'Participant Id']].copy()
mouse_data['Event Details'] = mouse_data['Event Details'].apply(parse_coordinates)
mouse_data = mouse_data.dropna(subset=['Event Details']).copy()

# Calculate mouse distance per minute
mouse_distance = mouse_data.groupby(['Timestamp', 'Condition', 'Participant Id']).apply(
    lambda group: sum(
        calculate_distance(
            group['Event Details'].iloc[i][0], group['Event Details'].iloc[i][1],
            group['Event Details'].iloc[i+1][0], group['Event Details'].iloc[i+1][1]
        )
        for i in range(len(group) - 1)
    )
).reset_index(name='mouse_distance')

# Process keyboard data
keyboard_data = df[df['Name'] == 'Keyboard'][['Timestamp', 'Event Details', 'Condition', 'Participant Id']].copy()

# Count keyboard events per minute
keyboard_count = keyboard_data.groupby(['Timestamp', 'Condition', 'Participant Id']).size().reset_index(name='keyboard_events')

# Merge all data
final_data = pd.merge(all_timestamps, gaze_frequency, on=['Timestamp', 'Condition', 'Participant Id'], how='left')
final_data = pd.merge(final_data, mouse_distance, on=['Timestamp', 'Condition', 'Participant Id'], how='left')
final_data = pd.merge(final_data, keyboard_count, on=['Timestamp', 'Condition', 'Participant Id'], how='left')

# Fill NaN values with 0
final_data['gaze_left_percent'] = final_data['gaze_left_percent'].fillna(0)
final_data['gaze_right_percent'] = final_data['gaze_right_percent'].fillna(0)
final_data['gaze_forward_percent'] = final_data['gaze_forward_percent'].fillna(0)
final_data['mouse_distance'] = final_data['mouse_distance'].fillna(0)
final_data['keyboard_events'] = final_data['keyboard_events'].fillna(0)

# # Save the processed data to a CSV file
# final_data.to_csv('processed_data_per_minute.csv', index=False)

print("Final DataFrame (Per Minute):\n", final_data.head())

Final DataFrame (Per Minute):
             Timestamp Condition Participant Id  gaze_left_percent  \
0 2025-03-04 10:37:00         T    PRCD-48C18D                  0   

   gaze_right_percent  gaze_forward_percent  mouse_distance  keyboard_events  
0                   0                     0    10420.564381               16  


  df['Timestamp'] = df['Timestamp'].dt.floor('T')  # 'T' = minute level
  mouse_distance = mouse_data.groupby(['Timestamp', 'Condition', 'Participant Id']).apply(
  final_data['gaze_left_percent'] = final_data['gaze_left_percent'].fillna(0)
  final_data['gaze_right_percent'] = final_data['gaze_right_percent'].fillna(0)
  final_data['gaze_forward_percent'] = final_data['gaze_forward_percent'].fillna(0)


In [26]:
final_data

Unnamed: 0,Timestamp,Condition,Participant Id,gaze_left_percent,gaze_right_percent,gaze_forward_percent,mouse_distance,keyboard_events
0,2025-03-04 10:37:00,T,PRCD-48C18D,0,0,0,10420.564381,16


### POC

In [13]:

# Clean the Timestamp column

# Drop rows with empty timestamps
df = df[df['Timestamp'] != ""]

# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Check for null values (NaT)
null_count = df['Timestamp'].isna().sum()
print(f"Number of rows with invalid timestamps: {null_count}")

df = df.dropna(subset=['Timestamp','Event Details'])

# Verify the cleaned DataFrame
print("Cleaned DataFrame:\n", df.head())

#Filter only 'Gaze' rows and keep relevant columns
gaze_data = df[df['Name'] == 'Gaze'][['Timestamp', 'Event Details', 'Condition', 'Participant Id']].copy()
gaze_data['Event Details'] = gaze_data['Event Details'].str.strip()
gaze_data = gaze_data.dropna(subset=['Event Details']).copy()

# Round timestamps to the nearest minute
gaze_data['Timestamp'] = gaze_data['Timestamp'].dt.floor('T')  # 'T' = minute level

# Calculate gaze frequencies while keeping Timestamp, Condition, and Participant ID
gaze_frequency = gaze_data.groupby(['Timestamp', 'Condition', 'Participant Id']).agg(
    gaze_left_percent=('Event Details', lambda x: (x == 'Left').sum() / len(x) * 100 if len(x) > 0 else np.nan),
    gaze_right_percent=('Event Details', lambda x: (x == 'Right').sum() / len(x) * 100 if len(x) > 0 else np.nan),
    gaze_forward_percent=('Event Details', lambda x: (x == 'Center').sum() / len(x) * 100 if len(x) > 0 else np.nan),
).reset_index()

print("Before dropping NaNs:", len(gaze_frequency))

# Drop NaN values after computing gaze frequencies
gaze_frequency_cleaned = gaze_frequency.dropna().copy()

print("After dropping NaNs:", len(gaze_frequency_cleaned))

#Save the processed gaze data



gaze_frequency_cleaned

  mouse_data['Timestamp'] = mouse_data['Timestamp'].dt.floor('T')
  mouse_distance = mouse_data.groupby(['Timestamp', 'Condition', 'Participant Id']).apply(
  keyboard_data['Timestamp'] = keyboard_data['Timestamp'].dt.floor('T')  # Round to nearest minute
  all_timestamps['Timestamp'] = all_timestamps['Timestamp'].dt.floor('T')


Unnamed: 0,Timestamp,Condition,Participant Id,mouse_distance,keyboard_events
0,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
1,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
2,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
3,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
4,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
5,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
6,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
7,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
8,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16
9,2025-03-04 10:37:00,T,PRCD-48C18D,10420.564381,16


In [130]:
df

Unnamed: 0,Participant Id,Condition,Timestamp,Name,Event Details
0,PRCD-5A6638,T,2025-02-26 12:22:30,Session,Task 1Word Unscramble Task
1,PRCD-5A6638,T,2025-02-26 12:22:30,Session,New Session Start
2,PRCD-5A6638,T,2025-02-26 12:22:30,Session,Session Start Time2025-02-26 12:22:30
3,PRCD-5A6638,T,2025-02-26 12:22:33,Gaze,Left
4,PRCD-5A6638,T,2025-02-26 12:22:33,Head Pose,Neutral and Looking Right
...,...,...,...,...,...
8244,PRCD-5A6638,R,2025-02-26 12:36:26,Head Pose,Looking Down and Looking Right
8245,PRCD-5A6638,R,2025-02-26 12:36:26,Head Pose,Looking Down and Looking Right
8246,PRCD-5A6638,R,2025-02-26 12:36:26,Head Pose,Looking Down and Looking Right
8247,PRCD-5A6638,R,2025-02-26 12:36:26,Head Pose,Looking Down and Looking Right


### Manual Feedback Append For Time Being

In [131]:
gaze_frequency_cleaned.loc[gaze_frequency_cleaned["Condition"] == "T", "NASA TLX Score"] = 3.5
gaze_frequency_cleaned.loc[gaze_frequency_cleaned["Condition"] == "C", "NASA TLX Score"] = 1.83
gaze_frequency_cleaned.loc[gaze_frequency_cleaned["Condition"] == "R", "NASA TLX Score"] = 1.83
gaze_frequency_cleaned.loc[gaze_frequency_cleaned["Condition"] == "I", "NASA TLX Score"] = 1.67



In [138]:
gaze_frequency_cleaned[gaze_frequency_cleaned['Condition'] == 'T'].to_csv('T.csv')



### Feedback Aggregation Not Working
