# Attention Seeker — Full ML Pipeline with Train/Test Split

This notebook processes the wearable sensor dataset, computes the Attention Score,
simulates Outside Factors, and **exports train/test CSV files** for model training.

Includes:
- Preprocessing raw sensor data
- 30‑second windowing
- Attention Score computation
- Outside Factors simulation (synthetic, randomly generated Sleep and Screen values)
- Train/Test splitting
- Saving `attention_scores.csv`, `train.csv` and `test.csv`


In [ ]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ==== CONFIGURE PATHS HERE ====
# Input: merged sensor CSV (placeholder -- point this at the real file).
DATA_PATH = "path/to/merged_sensors.csv"

# Output folder for the processed scores and the train/test CSVs; it is
# created up front so later cells can write into it.
OUTPUT_DIR = "./attention_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Column mappings: names of the columns the pipeline reads from the CSV.
TIME_COL, HR_COL, LEVEL_COL = "timestamp", "hr", "level"
ACC_X_COL, ACC_Y_COL, ACC_Z_COL = "acc_x", "acc_y", "acc_z"


In [ ]:
def load_data(path):
    """Read the sensor CSV at *path* and return it with a parsed time column.

    If the file has no ``TIME_COL`` column, the first of the alternative
    headers ``"time"``, ``"Time"``, ``"Timestamp"`` that is present is
    renamed to ``TIME_COL``. Values that cannot be parsed as datetimes are
    coerced to ``NaT`` and the corresponding rows are dropped.

    Raises:
        KeyError: if neither ``TIME_COL`` nor any alternative header exists.
    """
    df = pd.read_csv(path)
    if TIME_COL not in df.columns:
        aliases = ("time", "Time", "Timestamp")
        found = next((name for name in aliases if name in df.columns), None)
        if found is None:
            raise KeyError(
                f"No time column found: expected {TIME_COL!r} or one of {aliases}"
            )
        # Rename only the first match: renaming several aliases to the same
        # name would leave the frame with duplicate TIME_COL columns.
        df = df.rename(columns={found: TIME_COL})
    df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors='coerce')
    return df.dropna(subset=[TIME_COL])

def preprocess(df):
    """Sort samples by time and derive movement and heart-rate-change features.

    Adds two columns to a sorted copy of *df*:
      * ``movement`` -- Euclidean norm of the three accelerometer axes.
      * ``hr_diff``  -- absolute difference between consecutive HR samples.

    Rows with a missing HR, movement or hr_diff are dropped; this includes
    the first row after sorting, whose hr_diff is undefined. The input frame
    itself is not modified.
    """
    # A stable sort keeps rows that share a timestamp in their original
    # order, so hr_diff (which depends on row order) is reproducible.
    df = df.sort_values(TIME_COL, kind="mergesort")
    df['movement'] = np.sqrt(df[ACC_X_COL]**2 + df[ACC_Y_COL]**2 + df[ACC_Z_COL]**2)
    df['hr_diff'] = df[HR_COL].diff().abs()
    return df.dropna(subset=[HR_COL, 'movement', 'hr_diff'])

def compute_baselines(df):
    """Return the baseline triple (HR, hr_diff, movement) for *df*.

    Each baseline is the median of its column, raised to a small positive
    floor when the median falls below it.
    """
    floor = 1e-6
    medians = (df[col].median() for col in (HR_COL, 'hr_diff', 'movement'))
    return tuple(max(value, floor) for value in medians)

def window_data(df, window_seconds=30):
    """Aggregate samples into fixed-length time windows.

    Each row gets a ``window_id``: the number of whole ``window_seconds``
    intervals elapsed since the earliest timestamp in *df*. Rows are grouped
    by that id; HR, hr_diff and movement are averaged per window, and when a
    ``LEVEL_COL`` column is present its per-window median is included too.

    Note: the ``window_id`` column is added to *df* in place.

    Returns:
        A DataFrame with one row per window and ``window_id`` as a column.

    Raises:
        ValueError: if ``window_seconds`` is not positive.
    """
    # A zero or negative window length cannot produce valid integer ids.
    if window_seconds <= 0:
        raise ValueError(
            f"window_seconds must be positive, got {window_seconds!r}"
        )
    start = df[TIME_COL].min()
    elapsed = (df[TIME_COL] - start).dt.total_seconds()
    df['window_id'] = (elapsed // window_seconds).astype(int)
    agg = {
        HR_COL: 'mean',
        'hr_diff': 'mean',
        'movement': 'mean',
    }
    if LEVEL_COL in df.columns:
        agg[LEVEL_COL] = 'median'
    return df.groupby('window_id').agg(agg).reset_index()

def compute_attention(df, HR_rest, HRV_rest, M_rest, weights=(0.25, 0.50, 0.25)):
    """Add the three normalized terms and the weighted AttentionScore to *df*.

    Terms (each relative to its baseline):
      * ``HR_term``  = (HR - HR_rest) / HR_rest
      * ``HRV_term`` = (hr_diff - HRV_rest) / HRV_rest
      * ``M_term``   = (M_rest - movement) / M_rest, so movement below the
        baseline yields a positive term.

    Args:
        df: frame with the HR, ``hr_diff`` and ``movement`` columns.
        HR_rest, HRV_rest, M_rest: baselines used as divisors; they must be
            non-zero.
        weights: ``(HR, HRV, movement)`` weights for the final score. The
            default reproduces the original 0.25 / 0.50 / 0.25 weighting.

    Returns:
        The same *df* object, modified in place.
    """
    w_hr, w_hrv, w_move = weights
    df['HR_term'] = (df[HR_COL] - HR_rest) / HR_rest
    df['HRV_term'] = (df['hr_diff'] - HRV_rest) / HRV_rest
    df['M_term'] = (M_rest - df['movement']) / M_rest
    df['AttentionScore'] = (
        w_hr*df['HR_term'] + w_hrv*df['HRV_term'] + w_move*df['M_term']
    )
    return df

def simulate_outside_factors(df, seed=42):
    """Attach synthetic ``Sleep``, ``Screen`` and ``OutsideFactors`` columns.

    One value per row is drawn from a normal distribution for each factor:
    Sleep ~ N(7.5, 0.7) and Screen ~ N(3.5, 1.0), with Screen clipped to
    [0.5, 8]. ``OutsideFactors`` is the Sleep deviation from 7.5 minus the
    Screen deviation from 3.5.

    Args:
        df: frame to annotate; it is modified in place.
        seed: seed passed to ``np.random.default_rng``. The default (42)
            reproduces the original fixed-seed output; pass ``None`` for
            non-reproducible draws.

    Returns:
        The same *df* object with the three columns added.
    """
    sleep_mean, screen_mean = 7.5, 3.5
    rng = np.random.default_rng(seed)
    n = len(df)
    # Draw order matters for reproducibility: sleep first, then screen.
    sleep = rng.normal(sleep_mean, 0.7, n)
    screen = np.clip(rng.normal(screen_mean, 1.0, n), 0.5, 8)
    df['Sleep'] = sleep
    df['Screen'] = screen
    df['OutsideFactors'] = (sleep - sleep_mean) - (screen - screen_mean)
    return df


In [ ]:
# ---- RUN PIPELINE ----

# Load the raw CSV and derive the per-sample features.
df = preprocess(load_data(DATA_PATH))

# Baselines are computed on the per-sample data, before windowing.
HR_rest, HRV_rest, M_rest = compute_baselines(df)

# Window the samples, score each window, then attach the simulated factors.
df_w = simulate_outside_factors(
    compute_attention(window_data(df), HR_rest, HRV_rest, M_rest)
)

print("Processed windows:", df_w.shape)

# Persist the scored windows next to the train/test outputs.
processed_path = os.path.join(OUTPUT_DIR, "attention_scores.csv")
df_w.to_csv(processed_path, index=False)
processed_path

In [ ]:
# ---- TRAIN/TEST SPLIT ----

# Use the configured HR column name rather than a hard-coded 'hr', so the
# split keeps working if the column mapping is changed.
feature_cols = [HR_COL, 'hr_diff', 'movement', 'Sleep', 'Screen', 'OutsideFactors']
target_col = 'AttentionScore'

# Keep only windows where every feature and the target are present.
clean_df = df_w.dropna(subset=feature_cols + [target_col])

# 75/25 shuffled split with a fixed seed so the exported files are reproducible.
# NOTE(review): the windows are consecutive in time; if temporal leakage
# matters for the downstream model, consider shuffle=False -- confirm.
train_df, test_df = train_test_split(clean_df, test_size=0.25, random_state=0)

train_path = os.path.join(OUTPUT_DIR, 'train.csv')
test_path = os.path.join(OUTPUT_DIR, 'test.csv')

# Every column of the windowed frame is exported, not just feature_cols.
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

train_path, test_path