In [3]:
# 01_preprocessing_eye.ipynb
# IITB EdTech 2025 - Problem ID 7
# STEP 1: Understand and Prepare the Data (100% Robust)

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import cv2
import os
from pathlib import Path
import pickle
import logging
from tqdm.notebook import tqdm

# === SETUP ===
DATASET_ROOT = Path("/content/drive/MyDrive/STData/STData")
RESULTS_ROOT = Path("/content/results")
RESULTS_ROOT.mkdir(exist_ok=True)
(PREPROC := RESULTS_ROOT / "preprocessed").mkdir(exist_ok=True)
(LOGS := RESULTS_ROOT / "logs").mkdir(exist_ok=True)

logging.basicConfig(
    filename=LOGS / "step1_preprocessing.log",
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s'
)
log = logging.getLogger()

# === COLUMN MAPPING FUNCTION ===
def map_columns(df, candidates, name):
    df_cols = [c.lower() for c in df.columns]
    for cand in candidates:
        if cand.lower() in df_cols:
            col = df.columns[df_cols.index(cand.lower())]
            log.info(f"Column '{name}' → '{col}'")
            return col
    log.warning(f"Column '{name}' NOT FOUND. Tried: {candidates}")
    return None

# === LOAD & MAP PARTICIPANT ===
def load_participant(pid):
    folder = DATASET_ROOT / str(pid)
    if not folder.exists():
        log.warning(f"P{pid}: Folder missing")
        return None

    eye_path = folder / f"{pid}_EYE.csv"
    psy_path = folder / f"{pid}_PSY.csv"

    if not eye_path.exists() or not psy_path.exists():
        log.warning(f"P{pid}: Missing EYE or PSY file")
        return None

    eye = pd.read_csv(eye_path, low_memory=False)
    psy = pd.read_csv(psy_path)

    # --- Map EYE columns ---
    x_col = map_columns(eye, ['x', 'GazePointX', 'FPOGX', 'MappedFixationPointX'], 'x')
    y_col = map_columns(eye, ['y', 'GazePointY', 'FPOGY', 'MappedFixationPointY'], 'y')
    start_col = map_columns(eye, ['start', 'FixationStart', 'time', 'Start'], 'start')
    end_col = map_columns(eye, ['end', 'FixationEnd', 'End'], 'end')
    dur_col = map_columns(eye, ['duration', 'FixationDuration', 'Duration'], 'duration')

    if not all([x_col, y_col, start_col, end_col, dur_col]):
        log.error(f"P{pid}: Missing critical EYE columns")
        return None

    eye = eye.rename(columns={
        x_col: 'x', y_col: 'y',
        start_col: 'start', end_col: 'end',
        dur_col: 'duration'
    })[['x', 'y', 'start', 'end', 'duration']]

    # --- Map PSY columns ---
    qid_col = map_columns(psy, ['question_id', 'trial', 'qid', 'QuestionID'], 'question_id')
    diff_col = map_columns(psy, ['difficulty', 'task_type', 'level', 'Difficulty'], 'difficulty')
    rstart_col = map_columns(psy, ['routineStart', 'trial_start', 'RoutineStart', 'start_time'], 'routineStart')
    rend_col = map_columns(psy, ['routineEnd', 'trial_end', 'RoutineEnd', 'end_time'], 'routineEnd')
    stim_col = map_columns(psy, ['stimulus_path', 'image', 'stimulus', 'ImagePath'], 'stimulus_path')

    if not all([qid_col, diff_col, rstart_col, rend_col, stim_col]):
        log.error(f"P{pid}: Missing critical PSY columns")
        return None

    psy = psy.rename(columns={
        qid_col: 'question_id',
        diff_col: 'difficulty',
        rstart_col: 'routineStart',
        rend_col: 'routineEnd',
        stim_col: 'stimulus_path'
    })[['question_id', 'difficulty', 'routineStart', 'routineEnd', 'stimulus_path']]

    log.info(f"P{pid}: EYE={eye.shape}, PSY={psy.shape}")
    return {"EYE": eye, "PSY": psy, "folder": folder}

# === PREPROCESSING ===
def get_image_size(path):
    try:
        img = cv2.imread(str(path))
        return img.shape[1], img.shape[0]  # W, H
    except:
        return None, None

def normalize_gaze(eye_df, w, h, screen_w=1920, screen_h=1080):
    df = eye_df.copy()
    df['x_norm'] = df['x'] * w / screen_w
    df['y_norm'] = df['y'] * h / screen_h
    df['x_norm'] = df['x_norm'].clip(0, w-1)
    df['y_norm'] = df['y_norm'].clip(0, h-1)
    return df

def zscore_durations(eye_df):
    mean_dur = eye_df['duration'].mean()
    std_dur = eye_df['duration'].std()
    eye_df['duration_z'] = (eye_df['duration'] - mean_dur) / (std_dur + 1e-8)
    return eye_df

# === MAIN LOOP ===
all_trials = []
metadata = []
participant_summary = []

for pid in tqdm(range(1, 39), desc="Participants"):
    data = load_participant(pid)
    if not data: continue

    eye = zscore_durations(data["EYE"])
    psy = data["PSY"]
    folder = data["folder"]

    p_trials = 0
    diff_count = {"easy": 0, "medium": 0, "hard": 0}
    total_fix = 0

    for _, row in psy.iterrows():
        qid = row['question_id']
        diff = str(row['difficulty']).lower()
        if diff not in ["easy", "medium", "hard"]:
            continue

        start, end = row['routineStart'], row['routineEnd']
        stim_path = folder / row['stimulus_path']
        if not stim_path.exists():
            log.warning(f"P{pid} {qid}: Stimulus missing → {stim_path}")
            continue

        fixations = eye[(eye['start'] >= start) & (eye['end'] <= end)]
        if len(fixations) < 3:
            continue

        w, h = get_image_size(stim_path)
        if w is None:
            continue

        fixations = normalize_gaze(fixations, w, h)
        fixations['pid'] = pid
        fixations['qid'] = qid
        fixations['difficulty'] = diff
        fixations['stimulus_path'] = str(stim_path)

        all_trials.append(fixations)
        metadata.append({
            "pid": pid, "qid": qid, "difficulty": diff,
            "stimulus_path": str(stim_path),
            "img_w": w, "img_h": h,
            "routineStart": start, "routineEnd": end,
            "n_fixations": len(fixations)
        })

        p_trials += 1
        diff_count[diff] += 1
        total_fix += len(fixations)

    participant_summary.append({
        "pid": pid,
        "n_questions": p_trials,
        "n_easy": diff_count["easy"],
        "n_medium": diff_count["medium"],
        "n_hard": diff_count["hard"],
        "total_fixations": total_fix
    })

# === SAVE OUTPUTS ===
if all_trials:
    all_fix_df = pd.concat(all_trials, ignore_index=True)
    all_fix_df.to_csv(RESULTS_ROOT / "trial_fixations.csv", index=False)
    pd.DataFrame(metadata).to_csv(RESULTS_ROOT / "stimulus_metadata.csv", index=False)
    pd.DataFrame(participant_summary).to_csv(RESULTS_ROOT / "participant_summary.csv", index=False)

    # Save per-participant fixations
    for pid in range(1, 39):
        pdf = all_fix_df[all_fix_df['pid'] == pid]
        if not pdf.empty:
            with open(PREPROC / f"P{pid}_fixations.pkl", "wb") as f:
                pickle.dump(pdf, f)

    log.info(f"STEP 1 COMPLETE: {len(all_trials)} trials, {len(all_fix_df)} fixations")
    print(f"STEP 1 DONE → {len(all_trials)} trials saved")
else:
    log.error("No valid trials found")
    print("No data processed")

print("Check results/ and logs/step1_preprocessing.log")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Participants:   0%|          | 0/38 [00:00<?, ?it/s]

ERROR:root:P1: Missing critical EYE columns
ERROR:root:P2: Missing critical EYE columns
ERROR:root:P3: Missing critical EYE columns
ERROR:root:P4: Missing critical EYE columns
ERROR:root:P5: Missing critical EYE columns
ERROR:root:P6: Missing critical EYE columns
ERROR:root:P7: Missing critical EYE columns
ERROR:root:P8: Missing critical EYE columns
ERROR:root:P9: Missing critical EYE columns
ERROR:root:P10: Missing critical EYE columns
ERROR:root:P11: Missing critical EYE columns
ERROR:root:P12: Missing critical EYE columns
ERROR:root:P13: Missing critical EYE columns
ERROR:root:P14: Missing critical EYE columns
ERROR:root:P15: Missing critical EYE columns
ERROR:root:P16: Missing critical EYE columns
ERROR:root:P17: Missing critical EYE columns
ERROR:root:P18: Missing critical EYE columns
ERROR:root:P19: Missing critical EYE columns
ERROR:root:P20: Missing critical EYE columns
ERROR:root:P21: Missing critical EYE columns
ERROR:root:P22: Missing critical EYE columns
ERROR:root:P23: Mis

No data processed
Check results/ and logs/step1_preprocessing.log
