# Attention Seeker – Data Analysis

This notebook implements the core ML pipeline for the **Attention Seeker** project:

1. Load the CogLoad1 `merged_sensors.csv` dataset.
2. Preprocess the wearable sensor data.
3. Compute per-window features and an **Attention Score** based on HR, HRV (approximated from successive HR differences), and movement.
4. Simulate an **Outside Factors** score (sleep + screen time) as described in the proposal.
5. Compute the Pearson correlation between Attention Score and Outside Factors.
6. Train simple regression and classification models.
7. Save a processed CSV, a scatter plot, and a short text report.


In [None]:
# Configuration and imports
import os
from typing import Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    accuracy_score,
    f1_score,
    roc_auc_score,
)

# --- User configuration -----------------------------------------------------
# Adjust these paths before running. DATA_PATH is a placeholder and must be
# pointed at a real CSV file.
DATA_PATH = "path/to/merged_sensors.csv"  # e.g., './CogLoad1/train/raw/merged_sensors.csv'
OUTPUT_DIR = "./attention_output"  # where outputs (csv, plots, report) will be stored

# Create the output directory up front; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Column names (change if your CSV columns differ). The helper functions in
# this notebook look these names up at call time.
TIME_COL = "timestamp"       # or 'time'
HR_COL = "hr"                # heart rate
ACC_X_COL = "acc_x"          # accelerometer, x axis
ACC_Y_COL = "acc_y"          # accelerometer, y axis
ACC_Z_COL = "acc_z"          # accelerometer, z axis
LEVEL_COL = "level"          # cognitive load / task level
# NOTE(review): SUBJECT_COL is not referenced by any function in this part of
# the notebook -- confirm whether per-subject processing was intended.
SUBJECT_COL = "person"       # or 'user_id', etc. (optional)


In [None]:
# Helper functions: load, preprocess, windowing, scoring

def load_data(path: str) -> pd.DataFrame:
    """Read the merged sensor CSV and normalise the name of its time column.

    If the file has no column called ``TIME_COL``, the first of ``"time"``,
    ``"Time"`` or ``"timestamp"`` that is present is renamed to ``TIME_COL``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded frame, guaranteed to contain a ``TIME_COL`` column.

    Raises:
        ValueError: If neither ``TIME_COL`` nor any known alternative exists.
    """
    frame = pd.read_csv(path)

    # Fast path: the expected column name is already there.
    if TIME_COL in frame.columns:
        return frame

    # Otherwise pick the first known alternative spelling, if any.
    candidates = ("time", "Time", "timestamp")
    found = next((name for name in candidates if name in frame.columns), None)
    if found is None:
        raise ValueError(f"Could not find a time/timestamp column in {path}.")

    return frame.rename(columns={found: TIME_COL})


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw sensor frame and derive per-sample movement and HR change.

    Steps, in order:

    1. Parse ``TIME_COL`` to datetimes (unparseable values become ``NaT``).
    2. Drop rows without a valid timestamp and sort by time.
    3. Drop rows missing heart rate or any accelerometer axis.
    4. Add ``movement`` (Euclidean norm of the three accelerometer axes) and
       ``hr_diff`` (absolute difference between consecutive HR samples, used
       as a rough HRV proxy; the first row is ``NaN``).

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw frame containing ``TIME_COL``, ``HR_COL`` and the three
            accelerometer columns.

    Returns:
        A new, time-sorted frame with the extra ``movement`` and ``hr_diff``
        columns.

    Raises:
        KeyError: If the time column or any required sensor column is absent.
    """
    required_cols = [HR_COL, ACC_X_COL, ACC_Y_COL, ACC_Z_COL]
    # Fail early with a readable message instead of an opaque KeyError later.
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required sensor column(s): {missing}")

    # assign() returns a new frame, so the caller's data is never modified.
    parsed_time = pd.to_datetime(df[TIME_COL], errors="coerce")
    df = df.assign(**{TIME_COL: parsed_time})
    df = df.dropna(subset=[TIME_COL])
    df = df.sort_values(TIME_COL)

    # Drop rows with missing HR or accelerometer
    df = df.dropna(subset=required_cols)

    # Movement magnitude
    movement = np.sqrt(
        df[ACC_X_COL] ** 2 +
        df[ACC_Y_COL] ** 2 +
        df[ACC_Z_COL] ** 2
    )

    # Approximate HRV from absolute HR differences.
    # NOTE(review): the diff runs over the whole frame in time order; if the
    # file mixes several subjects, differences will cross subject boundaries.
    hr_diff = df[HR_COL].diff().abs()

    # Adding the derived columns through assign() (rather than item
    # assignment on the dropna() result) avoids SettingWithCopyWarning.
    return df.assign(movement=movement, hr_diff=hr_diff)


def compute_baselines(df: pd.DataFrame) -> Tuple[float, float, float]:
    """Return median-based baselines for HR, approximate HRV, and movement.

    Each baseline is the median of the corresponding column (``HR_COL``,
    ``"hr_diff"``, ``"movement"``), floored at ``1e-6`` so that it can be
    used safely as a divisor.

    Args:
        df: Preprocessed frame containing the three columns above.

    Returns:
        ``(hr_rest, hrv_rest, movement_rest)``.
    """
    floor = 1e-6
    columns = (HR_COL, "hr_diff", "movement")

    # Median first, then clamp from below with the small positive floor.
    hr_rest, hrv_rest, movement_rest = (
        max(df[column].median(), floor) for column in columns
    )
    return hr_rest, hrv_rest, movement_rest


def window_data(df: pd.DataFrame, window_seconds: int = 30) -> pd.DataFrame:
    """Aggregate samples into fixed-size time windows (e.g., 30 seconds).

    Windows are counted from the earliest timestamp in ``df``: a sample that
    is ``s`` seconds after the start falls into window ``s // window_seconds``.
    Within each window HR, ``hr_diff`` and ``movement`` are averaged; if a
    ``LEVEL_COL`` column exists its median is kept as well. Windows that
    contain no samples do not appear in the output.

    The input frame is not modified.

    Args:
        df: Preprocessed frame with a datetime ``TIME_COL`` plus ``HR_COL``,
            ``"hr_diff"`` and ``"movement"`` columns.
        window_seconds: Window length in seconds; must be positive.

    Returns:
        One row per non-empty window with a ``window_id`` column and the
        aggregated features; rows where any feature is NaN are dropped.

    Raises:
        ValueError: If ``window_seconds`` is not positive.
    """
    if window_seconds <= 0:
        raise ValueError("window_seconds must be a positive number of seconds.")

    t0 = df[TIME_COL].min()
    seconds_since_start = (df[TIME_COL] - t0).dt.total_seconds()
    # Keep the window ids in a standalone Series instead of writing a new
    # column into the caller's frame.
    window_ids = (seconds_since_start // window_seconds).astype(int)
    window_ids = window_ids.rename("window_id")

    agg_dict = {
        HR_COL: "mean",
        "hr_diff": "mean",
        "movement": "mean",
    }
    if LEVEL_COL in df.columns:
        agg_dict[LEVEL_COL] = "median"

    # Grouping by the named Series yields a "window_id" index, which
    # reset_index() turns back into a regular column.
    windowed = df.groupby(window_ids).agg(agg_dict).reset_index()
    windowed = windowed.dropna(subset=[HR_COL, "hr_diff", "movement"])
    return windowed


def compute_attention_score(
    df: pd.DataFrame,
    hr_rest: float,
    hrv_rest: float,
    movement_rest: float,
    w_hr: float = 0.25,
    w_hrv: float = 0.50,
    w_movement: float = 0.25,
) -> pd.DataFrame:
    """Add the weighted Attention Score and its three component terms to ``df``.

    Each component is a relative deviation from its baseline:

    * ``HR_term``  = (HR - hr_rest) / hr_rest
    * ``HRV_term`` = (hr_diff - hrv_rest) / hrv_rest
    * ``M_term``   = (movement_rest - movement) / movement_rest

    and ``AttentionScore`` is their weighted sum. Baselines used as divisors
    are floored at ``1e-6``.

    Args:
        df: Frame with ``HR_COL``, ``"hr_diff"`` and ``"movement"`` columns.
            It is modified in place.
        hr_rest: Baseline heart rate.
        hrv_rest: Baseline for the ``hr_diff`` HRV proxy.
        movement_rest: Baseline movement magnitude.
        w_hr: Weight of the HR term.
        w_hrv: Weight of the HRV term.
        w_movement: Weight of the movement term.

    Returns:
        The same ``df`` object, now carrying ``HR_term``, ``HRV_term``,
        ``M_term`` and ``AttentionScore`` columns.
    """
    eps = 1e-6

    # Component terms keyed by their output column name. Note the movement
    # term is baseline-minus-observed, the reverse of the other two.
    terms = {
        "HR_term": (df[HR_COL].values - hr_rest) / max(hr_rest, eps),
        "HRV_term": (df["hr_diff"].values - hrv_rest) / max(hrv_rest, eps),
        "M_term": (movement_rest - df["movement"].values) / max(movement_rest, eps),
    }

    score = (
        w_hr * terms["HR_term"]
        + w_hrv * terms["HRV_term"]
        + w_movement * terms["M_term"]
    )

    for column, values in terms.items():
        df[column] = values
    df["AttentionScore"] = score
    return df


def simulate_outside_factors(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Attach simulated sleep/screen hours and a combined OutsideFactors score.

    One value per row is drawn for each quantity from a seeded NumPy
    generator: sleep ~ N(7.5, 0.7) and screen ~ N(3.5, 1.0), with screen
    clipped to [0.5, 8.0]. The score is the sleep deviation from 7.5 minus
    the screen deviation from 3.5.

    Args:
        df: Frame to annotate; it is modified in place.
        random_state: Seed for ``np.random.default_rng``.

    Returns:
        The same ``df`` object with ``SleepHours``, ``ScreenHours`` and
        ``OutsideFactors`` columns added.
    """
    baseline_sleep, baseline_screen = 7.5, 3.5
    generator = np.random.default_rng(random_state)
    row_count = len(df)

    # Draw order matters for reproducibility: sleep first, then screen.
    sleep_hours = generator.normal(7.5, 0.7, size=row_count)
    screen_hours = np.clip(generator.normal(3.5, 1.0, size=row_count), 0.5, 8.0)

    df["SleepHours"] = sleep_hours
    df["ScreenHours"] = screen_hours
    df["OutsideFactors"] = (
        (sleep_hours - baseline_sleep) - (screen_hours - baseline_screen)
    )
    return df


def compute_correlation(df: pd.DataFrame) -> float:
    """Return the Pearson correlation between AttentionScore and OutsideFactors.

    Rows where either value is NaN or infinite are ignored, so a single bad
    row does not turn the whole result into NaN.

    Args:
        df: Frame with ``AttentionScore`` and ``OutsideFactors`` columns.

    Returns:
        Pearson's r as a float, or NaN when fewer than two usable rows remain
        or when either column is constant (the correlation is undefined).
    """
    x = df["AttentionScore"].to_numpy(dtype=float)
    y = df["OutsideFactors"].to_numpy(dtype=float)

    # Keep only pairs where both values are usable.
    usable = np.isfinite(x) & np.isfinite(y)
    x, y = x[usable], y[usable]

    if x.size < 2:
        return float("nan")
    # Zero variance makes r undefined; return NaN directly instead of letting
    # np.corrcoef divide by zero and emit a RuntimeWarning.
    if x.max() == x.min() or y.max() == y.min():
        return float("nan")

    return float(np.corrcoef(x, y)[0, 1])


In [None]:
# Helper functions: ML models, plotting, reporting

def train_models(df: pd.DataFrame):
    """Train a linear regressor and a logistic-regression lapse classifier.

    Features are HR, ``hr_diff``, ``movement``, ``SleepHours``,
    ``ScreenHours`` and ``OutsideFactors``. The regression target is
    ``AttentionScore``; the classification target is 1 when
    ``AttentionScore < -0.05`` (a "lapse") and 0 otherwise. Data is split
    75/25 with a fixed seed.

    NOTE(review): ``compute_attention_score`` in this notebook builds
    AttentionScore as a weighted linear combination of HR, hr_diff and
    movement, which are also used as features here. The reported scores
    therefore measure how well the formula is re-learned rather than
    out-of-sample predictive power -- confirm this is intended.

    Args:
        df: Windowed frame containing the feature columns and
            ``AttentionScore``. Rows with missing values are ignored.

    Returns:
        Dict with keys ``regression_r2``, ``regression_mae``,
        ``classification_accuracy``, ``classification_f1`` and
        ``classification_auc``. The three classification entries are NaN when
        the training split contains only one class; AUC alone is NaN when it
        cannot be computed on the test split.

    Raises:
        ValueError: If fewer than two complete rows are available.
    """
    feature_cols = [
        HR_COL,
        "hr_diff",
        "movement",
        "SleepHours",
        "ScreenHours",
        "OutsideFactors",
    ]
    df = df.dropna(subset=feature_cols + ["AttentionScore"])
    if len(df) < 2:
        # train_test_split would fail anyway; say why in plain words.
        raise ValueError(
            "train_models needs at least 2 complete rows to build train/test splits."
        )

    X = df[feature_cols].values
    y_reg = df["AttentionScore"].values

    # Define lapse: Attention below some threshold
    lapse_threshold = -0.05
    y_cls = (y_reg < lapse_threshold).astype(int)

    X_train, X_test, y_reg_train, y_reg_test, y_cls_train, y_cls_test = train_test_split(
        X, y_reg, y_cls, test_size=0.25, random_state=0
    )

    # Regression
    lin = LinearRegression()
    lin.fit(X_train, y_reg_train)
    y_reg_pred = lin.predict(X_test)
    r2 = r2_score(y_reg_test, y_reg_pred)
    mae = mean_absolute_error(y_reg_test, y_reg_pred)

    # Classification. LogisticRegression cannot be fitted on a single class,
    # so in that case report NaN metrics (the same convention already used
    # for an undefined AUC) instead of crashing the whole pipeline.
    acc = f1 = auc = float("nan")
    if np.unique(y_cls_train).size >= 2:
        log = LogisticRegression(max_iter=1000)
        log.fit(X_train, y_cls_train)
        y_cls_pred = log.predict(X_test)
        y_cls_prob = log.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_cls_test, y_cls_pred)
        # zero_division=0 keeps the score at 0 but silences the warning when
        # no positive class is predicted or present.
        f1 = f1_score(y_cls_test, y_cls_pred, zero_division=0)
        try:
            auc = roc_auc_score(y_cls_test, y_cls_prob)
        except ValueError:
            # Raised when the test split holds only one class.
            auc = float("nan")

    metrics = {
        "regression_r2": r2,
        "regression_mae": mae,
        "classification_accuracy": acc,
        "classification_f1": f1,
        "classification_auc": auc,
    }

    return metrics


def plot_attention_vs_outside(df: pd.DataFrame, out_path: str):
    """Save a scatter plot of AttentionScore (y) against OutsideFactors (x).

    Dashed grey lines mark zero on both axes. The figure is always closed,
    even if drawing or saving fails, so repeated calls do not accumulate
    open figures.

    Args:
        df: Frame with ``OutsideFactors`` and ``AttentionScore`` columns.
        out_path: File path the image is written to (150 dpi).
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.scatter(df["OutsideFactors"], df["AttentionScore"], alpha=0.5)
        ax.axhline(0.0, color="gray", linestyle="--", linewidth=1)
        ax.axvline(0.0, color="gray", linestyle="--", linewidth=1)
        ax.set_xlabel("Outside Factors Score")
        ax.set_ylabel("Attention Score")
        ax.set_title("Attention Score vs Outside Factors")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Release the figure whether or not saving succeeded.
        plt.close(fig)


def save_report(out_dir: str, correlation_r: float, metrics: dict):
    """Write ``report.txt`` in *out_dir* with the correlation and model metrics.

    Args:
        out_dir: Existing directory that receives ``report.txt``.
        correlation_r: Pearson r between AttentionScore and OutsideFactors.
        metrics: Mapping with the keys ``regression_r2``, ``regression_mae``,
            ``classification_accuracy``, ``classification_f1`` and
            ``classification_auc``.

    All numbers are formatted with three decimals. The path of the written
    file is printed when done.
    """
    report_path = os.path.join(out_dir, "report.txt")

    def _chunks():
        # Yield the report piece by piece, in output order; each piece is
        # formatted only when the writer asks for it.
        yield "Attention Seeker – ML Summary\n"
        yield "=================================\n\n"
        yield f"Pearson correlation (AttentionScore vs OutsideFactors): {correlation_r:.3f}\n\n"
        yield "Regression (predict AttentionScore):\n"
        yield f"  R^2 : {metrics['regression_r2']:.3f}\n"
        yield f"  MAE : {metrics['regression_mae']:.3f}\n\n"
        yield "Classification (predict low-attention lapse):\n"
        yield f"  Accuracy : {metrics['classification_accuracy']:.3f}\n"
        yield f"  F1       : {metrics['classification_f1']:.3f}\n"
        yield f"  AUC      : {metrics['classification_auc']:.3f}\n"

    with open(report_path, "w", encoding="utf-8") as handle:
        handle.writelines(_chunks())
    print(f"[INFO] Wrote report to {report_path}")


In [None]:
# Main pipeline: run analysis and generate outputs

# End-to-end run: load -> preprocess -> baselines -> window -> score ->
# simulate outside factors -> save CSV -> correlate -> train -> plot -> report.
# The statements are order-dependent; each step consumes the previous result.

print(f"[INFO] Loading data from {DATA_PATH} ...")
df_raw = load_data(DATA_PATH)

print("[INFO] Preprocessing sensor data ...")
df_prep = preprocess(df_raw)

# Baselines are taken from the per-sample data, before windowing.
print("[INFO] Computing baselines ...")
hr_rest, hrv_rest, movement_rest = compute_baselines(df_prep)
print(f"  HR_rest        = {hr_rest:.3f}")
print(f"  HRV_rest       = {hrv_rest:.3f}")
print(f"  Movement_rest  = {movement_rest:.3f}")

print("[INFO] Windowing data into 30-second windows ...")
df_win = window_data(df_prep, window_seconds=30)

# Uses the default weights of compute_attention_score.
print("[INFO] Computing Attention Score ...")
df_win = compute_attention_score(df_win, hr_rest, hrv_rest, movement_rest)

# Uses the default seed of simulate_outside_factors, so reruns are repeatable.
print("[INFO] Simulating Outside Factors ...")
df_win = simulate_outside_factors(df_win)

# Save processed CSV
# (written before modelling, so it exists even if a later step fails)
csv_path = os.path.join(OUTPUT_DIR, "attention_scores.csv")
df_win.to_csv(csv_path, index=False)
print(f"[INFO] Saved attention_scores.csv to: {csv_path}")

# Correlation
print("[INFO] Computing correlation ...")
r = compute_correlation(df_win)
print(f"[RESULT] Pearson r(AttentionScore, OutsideFactors) = {r:.3f}")

# ML models
print("[INFO] Training ML models ...")
metrics = train_models(df_win)
print("[RESULT] Regression R^2:", metrics["regression_r2"])
print("[RESULT] Regression MAE:", metrics["regression_mae"])
print("[RESULT] Classification Accuracy:", metrics["classification_accuracy"])
print("[RESULT] Classification F1:", metrics["classification_f1"])
print("[RESULT] Classification AUC:", metrics["classification_auc"])

# Plot
print("[INFO] Plotting AttentionScore vs OutsideFactors ...")
scatter_path = os.path.join(OUTPUT_DIR, "attention_scatter.png")
plot_attention_vs_outside(df_win, scatter_path)
print(f"[INFO] Saved scatter plot to: {scatter_path}")

# Report
# (report.txt in OUTPUT_DIR, summarising r and the metrics dict)
save_report(OUTPUT_DIR, r, metrics)

print("[INFO] Done.")
