# Attention Seeker — Full ML Pipeline with Train/Test Split

This notebook processes the wearable sensor dataset, computes the Attention Score,
simulates Outside Factors, and **exports train/test CSV files** for model training.

Includes:
- Preprocessing raw sensor data
- 30-second windowing
- Attention Score computation
- Outside Factors simulation
- Train/Test splitting
- Saving `train.csv` and `test.csv`


In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ==== CONFIGURE PATHS HERE ====
# DATA_PATH is the CSV handed to load_data(); OUTPUT_DIR receives
# attention_scores.csv, train.csv and test.csv.
DATA_PATH = "data/CogLoad1/train/raw/merged_sensors.csv"   # <-- point this at your copy of the data
OUTPUT_DIR = "./attention_output"          # folder for every file this notebook writes
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok=True keeps re-runs of this cell harmless

# Column mappings: the DataFrame column names the pipeline functions look up.
TIME_COL = "timestamp"  # parsed to datetime by load_data(); drives sorting and windowing
HR_COL = "hr"  # heart-rate column; its row-to-row change becomes 'hr_diff'
ACC_X_COL = "acc_x"  # the three accelerometer axes are combined into 'movement'
ACC_Y_COL = "acc_y"
ACC_Z_COL = "acc_z"
LEVEL_COL = "level"  # optional: window_data() aggregates it only when present


In [7]:
def load_data(path, time_unit=None):
    """Read the sensor CSV and return it with a parsed ``TIME_COL`` column.

    Parameters
    ----------
    path : str or path-like
        CSV file to read with ``pd.read_csv``.
    time_unit : str, optional
        Epoch unit (for example ``"ms"`` or ``"s"``) to use when the time
        column stores numeric epoch values. With the default ``None`` the
        column is handed to ``pd.to_datetime`` unchanged, which interprets
        bare numbers as nanoseconds since 1970-01-01.

    Returns
    -------
    pandas.DataFrame
        The loaded rows, minus those whose timestamp could not be parsed.

    Raises
    ------
    KeyError
        If neither ``TIME_COL`` nor one of the known alternative names
        ("time", "Time", "Timestamp") is a column of the file.
    """
    alt_names = ("time", "Time", "Timestamp")
    df = pd.read_csv(path)

    if TIME_COL not in df.columns:
        # Rename only the first alternative that matches. Renaming every match
        # would create several columns called TIME_COL and break the parsing
        # step below.
        match = next((name for name in alt_names if name in df.columns), None)
        if match is None:
            raise KeyError(
                f"No time column found: expected '{TIME_COL}' or one of {list(alt_names)}"
            )
        df = df.rename(columns={match: TIME_COL})

    if time_unit is None:
        df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors='coerce')
    else:
        # Coerce to numbers first so epoch values stored as text honour the
        # requested unit as well; anything non-numeric becomes NaT.
        epoch = pd.to_numeric(df[TIME_COL], errors='coerce')
        df[TIME_COL] = pd.to_datetime(epoch, unit=time_unit, errors='coerce')

    # Rows without a usable timestamp cannot be sorted or windowed.
    return df.dropna(subset=[TIME_COL])

def preprocess(df):
    """Sort by time and derive the per-sample ``movement`` and ``hr_diff`` features.

    ``movement`` is the Euclidean norm of the three accelerometer columns and
    ``hr_diff`` is the absolute change in heart rate from the previous row of
    the time-sorted frame. Rows with a missing value in the heart-rate column
    or in either derived column are removed, which always includes the first
    sorted row because it has no predecessor to difference against.

    The input frame is not modified; ``sort_values`` returns a new frame.
    """
    ordered = df.sort_values(TIME_COL)

    x, y, z = (ordered[axis] for axis in (ACC_X_COL, ACC_Y_COL, ACC_Z_COL))
    ordered['movement'] = np.sqrt(x**2 + y**2 + z**2)

    heart_rate = ordered[HR_COL]
    ordered['hr_diff'] = heart_rate.diff().abs()

    must_be_present = [HR_COL, 'movement', 'hr_diff']
    return ordered.dropna(subset=must_be_present)

def compute_baselines(df):
    """Return the medians of heart rate, ``hr_diff`` and ``movement`` as a 3-tuple.

    Each median is floored at 1e-6 so the values can be used as denominators
    without dividing by zero. The order of the tuple is
    (heart rate, hr_diff, movement).
    """
    floor = 1e-6
    columns = (HR_COL, 'hr_diff', 'movement')
    hr_base, hrv_base, movement_base = (max(df[col].median(), floor) for col in columns)
    return hr_base, hrv_base, movement_base

def window_data(df, window_seconds=30):
    """Aggregate samples into consecutive fixed-length time windows.

    Window ``k`` holds every row whose timestamp lies in
    ``[t0 + k * window_seconds, t0 + (k + 1) * window_seconds)``, where ``t0``
    is the earliest timestamp in ``df``. Windows that contain no rows do not
    appear in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TIME_COL`` (datetime), ``HR_COL``, 'hr_diff' and
        'movement'. ``LEVEL_COL`` is aggregated too when it is present.
    window_seconds : int or float, default 30
        Window length in seconds; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty window with a 'window_id' column, the mean of
        ``HR_COL``, 'hr_diff' and 'movement', and the median of ``LEVEL_COL``
        when available.

    Raises
    ------
    ValueError
        If ``window_seconds`` is not positive.
    """
    if window_seconds <= 0:
        raise ValueError(f"window_seconds must be positive, got {window_seconds!r}")

    # Keep the window ids in a standalone Series instead of writing a new
    # column into `df`: the caller's frame stays untouched and repeated calls
    # with different window sizes cannot interfere with each other.
    elapsed = (df[TIME_COL] - df[TIME_COL].min()).dt.total_seconds()
    window_id = (elapsed // window_seconds).astype(int).rename('window_id')

    agg = {
        HR_COL: 'mean',
        'hr_diff': 'mean',
        'movement': 'mean',
    }
    if LEVEL_COL in df.columns:
        agg[LEVEL_COL] = 'median'

    # The Series name becomes the index name, so reset_index() restores the
    # 'window_id' column callers expect.
    return df.groupby(window_id).agg(agg).reset_index()

def compute_attention(df, HR_rest, HRV_rest, M_rest):
    """Add the three relative-deviation terms and the weighted ``AttentionScore``.

    Parameters
    ----------
    df : pandas.DataFrame
        Windowed data containing ``HR_COL``, 'hr_diff' and 'movement'.
    HR_rest, HRV_rest, M_rest : float
        Baselines for heart rate, heart-rate change and movement, e.g. the
        values returned by ``compute_baselines``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four extra columns:

        * ``HR_term``  = (hr - HR_rest) / HR_rest
        * ``HRV_term`` = (hr_diff - HRV_rest) / HRV_rest
        * ``M_term``   = (M_rest - movement) / M_rest, so less movement than
          baseline gives a positive term
        * ``AttentionScore`` = 0.25*HR_term + 0.50*HRV_term + 0.25*M_term
    """
    eps = 1e-6
    # Floor the baselines the same way compute_baselines() does, so a zero
    # baseline passed in directly cannot cause a division by zero.
    hr_base = max(HR_rest, eps)
    hrv_base = max(HRV_rest, eps)
    m_base = max(M_rest, eps)

    # Work on a copy so the caller's frame is not modified behind its back.
    out = df.copy()
    out['HR_term'] = (out[HR_COL] - hr_base) / hr_base
    out['HRV_term'] = (out['hr_diff'] - hrv_base) / hrv_base
    out['M_term'] = (m_base - out['movement']) / m_base
    out['AttentionScore'] = 0.25*out['HR_term'] + 0.50*out['HRV_term'] + 0.25*out['M_term']
    return out

def simulate_outside_factors(df, seed=42):
    """Attach simulated ``Sleep``, ``Screen`` and ``OutsideFactors`` columns.

    The values are synthetic: they are drawn from normal distributions and do
    not depend on the contents of ``df``, only on its length.

    Parameters
    ----------
    df : pandas.DataFrame
        Any frame; one simulated value is generated per row.
    seed : int, default 42
        Seed for ``np.random.default_rng``. The default reproduces the values
        of the previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with

        * ``Sleep``  ~ Normal(7.5, 0.7), not clipped
        * ``Screen`` ~ Normal(3.5, 1.0), clipped to [0.5, 8]
        * ``OutsideFactors`` = (Sleep - 7.5) - (Screen - 3.5)
    """
    rng = np.random.default_rng(seed)
    n = len(df)

    # Draw order matters for reproducibility: sleep first, then screen.
    sleep = rng.normal(7.5, 0.7, n)
    screen = np.clip(rng.normal(3.5, 1.0, n), 0.5, 8)

    # Copy so the caller's frame is left untouched.
    out = df.copy()
    out['Sleep'] = sleep
    out['Screen'] = screen
    out['OutsideFactors'] = (sleep - 7.5) - (screen - 3.5)
    return out


In [8]:
# Load the raw file and map the `band_a*` accelerometer columns onto the names
# the pipeline expects (ACC_X_COL / ACC_Y_COL / ACC_Z_COL).
sensors = load_data(DATA_PATH).rename(columns={
    'band_ax': ACC_X_COL,
    'band_ay': ACC_Y_COL,
    'band_az': ACC_Z_COL,
})

# 1. Force the critical columns to numeric BEFORE deriving any features.
#    errors='coerce' turns text like "Level 1" or "Start" into NaN, so
#    'movement' and 'hr_diff' are only ever computed from real numbers.
required_cols = [HR_COL, ACC_X_COL, ACC_Y_COL, ACC_Z_COL]
# Only include the level column if it actually exists in the dataframe.
numeric_cols = required_cols + ([LEVEL_COL] if LEVEL_COL in sensors.columns else [])

for col in numeric_cols:
    print(f"Converting {col} to numeric...")
    sensors[col] = pd.to_numeric(sensors[col], errors='coerce')

# 2. Drop rows where heart rate or any accelerometer axis is unusable. A
#    missing axis makes 'movement' NaN, so such rows are useless downstream.
before = len(sensors)
sensors = sensors.dropna(subset=required_cols)
print(f"Dropped {before - len(sensors)} rows containing non-numeric data.")

# 3. Sort by time and derive 'movement' / 'hr_diff' from the cleaned values.
df = preprocess(sensors)

# Check what the columns are actually named
print("Columns found:", df.columns.tolist())
print(df.head())

Columns found: ['datetime', 'timestamp', 'user_id', 'level', 'task', 'hr', 'gsr', 'rr', 'temperature', 'TLX_mean', 'TLX_mental_demand', 'TLX_physical_demand', 'TLX_temporal_demand', 'TLX_performance', 'TLX_effort', 'TLX_frustration', 'acc_x', 'acc_y', 'acc_z', 'opacity_median', 'opacity_std', 'dataset', 'movement', 'hr_diff']
                  datetime                     timestamp user_id level   task  \
53092  2017-08-21 11:10:37 1970-01-01 00:25:03.313836500   iz2ps     0  quest   
53093  2017-08-21 11:10:38 1970-01-01 00:25:03.313837501   iz2ps     0  quest   
53094  2017-08-21 11:10:39 1970-01-01 00:25:03.313838501   iz2ps     0  quest   
53095  2017-08-21 11:10:40 1970-01-01 00:25:03.313839488   iz2ps     0  quest   
53096  2017-08-21 11:10:41 1970-01-01 00:25:03.313840485   iz2ps     0  quest   

         hr       gsr        rr  temperature  TLX_mean  ...  TLX_effort  \
53092  70.0  0.035826  0.326309    29.120001        -1  ...          -1   
53093  70.0  0.034954  0.613904    

In [9]:
# ==== MODIFIED PIPELINE FOR YOUR LOADED DATA ====
# Works on the `df` already held in memory by the previous cell, so the file is
# not read again. The accelerometer columns are expected to be named
# acc_x / acc_y / acc_z by this point.

# Output folder (created if missing).
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Starting pipeline with {len(df)} rows...")

# Sort by time and (re)derive 'movement' and 'hr_diff'.
processed_df = preprocess(df)

# Baselines are the medians over the whole recording, floored at 1e-6.
HR_rest, HRV_rest, M_rest = compute_baselines(processed_df)
print(f"Baselines -> HR: {HR_rest:.1f}, HRV: {HRV_rest:.1f}, Movement: {M_rest:.1f}")

# Window (30-second default) -> attention score -> simulated outside factors.
df_w = (
    window_data(processed_df)
    .pipe(compute_attention, HR_rest, HRV_rest, M_rest)
    .pipe(simulate_outside_factors)
)

print("Processed windows shape:", df_w.shape)

# Persist the per-window table as an intermediate artefact.
processed_path = os.path.join(OUTPUT_DIR, "attention_scores.csv")
df_w.to_csv(processed_path, index=False)
print(f"Saved attention scores to: {processed_path}")

Starting pipeline with 94331 rows...
Baselines -> HR: 73.7, HRV: 0.3, Movement: 1.0
Processed windows shape: (1, 12)
Saved attention scores to: ./attention_output/attention_scores.csv


In [10]:
# ==== FIXING THE TIMESTAMP (CORRECTED UNIT) ====
#
# The raw file stores epoch milliseconds, but load_data() parsed them with
# pd.to_datetime's default unit (nanoseconds). Every sample therefore landed
# within the first half hour of 1970 and the 30-second windowing collapsed the
# whole recording into a single window.
#
# This cell relies on HR_rest / HRV_rest / M_rest computed by the pipeline cell.

# 1. Reinterpret the timestamps as milliseconds, but only while they are still
#    mis-parsed. The guard keeps the cell safe to re-run: converting values
#    that are already correct a second time would overflow the datetime range.
MISPARSED_BEFORE = pd.Timestamp("1971-01-01")
if df[TIME_COL].max() < MISPARSED_BEFORE:
    raw_epoch = pd.to_numeric(df[TIME_COL])  # back to the integers read from the file
    df[TIME_COL] = pd.to_datetime(raw_epoch, unit='ms')

print(f"✅ Timestamp fixed! Range: {df[TIME_COL].min()} to {df[TIME_COL].max()}")

# 2. Re-run windowing (now that time is correct), using 10-second windows
df_w = window_data(df, window_seconds=10)
print(f"Windows created: {len(df_w)}")

# 3. Resume pipeline
df_w = compute_attention(df_w, HR_rest, HRV_rest, M_rest)
df_w = simulate_outside_factors(df_w)

# 4. Final split
# NOTE(review): AttentionScore is a fixed linear combination of hr, hr_diff and
# movement, and consecutive windows are split at random. Keep both in mind when
# interpreting test-set metrics.
feature_cols = ['hr', 'hr_diff', 'movement', 'Sleep', 'Screen', 'OutsideFactors']
target_col = 'AttentionScore'
clean_df = df_w.dropna(subset=feature_cols + [target_col])

print(f"Rows available for training: {len(clean_df)}")

if len(clean_df) > 10:
    train_df, test_df = train_test_split(clean_df, test_size=0.25, random_state=42)

    train_path = os.path.join(OUTPUT_DIR, 'train.csv')
    test_path = os.path.join(OUTPUT_DIR, 'test.csv')

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print("\n🎉 SUCCESS! Data is split and saved.")
    print(f"Train path: {train_path}")
    print(f"Test path: {test_path}")
else:
    print("\n⚠️ Still not enough data. Check the timestamp range printout above.")

✅ Timestamp fixed! Range: 2017-08-21 11:10:36.500000 to 2017-10-20 12:33:31.401000
Windows created: 9494
Rows available for training: 9494

🎉 SUCCESS! Data is split and saved.
Train path: ./attention_output/train.csv
Test path: ./attention_output/test.csv


In [11]:
# ---- TRAIN/TEST SPLIT ----
# Repeats the split on the current `df_w` and overwrites train.csv / test.csv.

# Model inputs and the value to predict.
feature_cols = ['hr', 'hr_diff', 'movement', 'Sleep', 'Screen', 'OutsideFactors']
target_col = 'AttentionScore'

# Keep only windows where every feature and the target are present.
clean_df = df_w.dropna(subset=[*feature_cols, target_col])

# 75% train / 25% test; the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(clean_df, random_state=42, test_size=0.25)

# Both files go to OUTPUT_DIR.
train_path, test_path = (
    os.path.join(OUTPUT_DIR, filename) for filename in ('train.csv', 'test.csv')
)

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("SUCCESS!")
print(f"Train set: {train_df.shape} saved to {train_path}")
print(f"Test set:  {test_df.shape} saved to {test_path}")

SUCCESS!
Train set: (7120, 12) saved to ./attention_output/train.csv
Test set:  (2374, 12) saved to ./attention_output/test.csv
