In [None]:
import os
import json
import glob
import re
import pandas as pd

# Notebook configuration. The three directory values are placeholders: fill
# them in before running the remaining cells.
SESSION_DATA_DIR = ""  # root with one folder per participant (actions_*.jsonl, session_info* files)
DATA_FINAL_DIR = ""  # output root; <participant>/session.csv files are written below it
LOCAL_TZ = "US/Eastern"  # timezone attached to timestamps that carry no tz info
mtrack_dir = ""  # folder holding the MIT*_SCENARIO_*.xlsx exports


# Sanity check: both print False until the placeholder paths above are set.
print(f"Session Data: {os.path.exists(SESSION_DATA_DIR)}")
print(f"Data Final:   {os.path.exists(DATA_FINAL_DIR)}")

In [None]:
# Preview the first ten entries under SESSION_DATA_DIR. For each directory,
# list its session_info* files, or fall back to its first five file names.
for entry_name in sorted(os.listdir(SESSION_DATA_DIR))[:10]:
    entry_path = os.path.join(SESSION_DATA_DIR, entry_name)
    if not os.path.isdir(entry_path):
        continue

    contents = os.listdir(entry_path)
    info_files = [name for name in contents if name.startswith("session_info")]

    print(f"{entry_name}/")
    if not info_files:
        print(f"    Files: {contents[:5]}")
    else:
        for info_name in info_files:
            print(f"    {info_name}")
    print()

In [None]:
# Disabled exploration cell (kept for reference, nothing here executes).
# When uncommented it loads one sample session_info JSON file and prints its
# top-level keys, followed by a short summary of the first ten values.

# sample_path = os.path.join(
#     SESSION_DATA_DIR, "MIT008", "session_info_20251031_134457.json"
# )

# with open(sample_path, "r") as f:
#     data = json.load(f)


# print(list(data.keys()))
# print()

# if isinstance(data, dict):
#     for key in list(data.keys())[:10]:
#         val = data[key]
#         if isinstance(val, list):
#             print(f"{key}: list with {len(val)} items")
#             if len(val) > 0:
#                 print(
#                     f"    First item keys: {list(val[0].keys()) if isinstance(val[0], dict) else type(val[0])}"
#                 )
#         elif isinstance(val, dict):
#             print(f"{key}: dict with keys {list(val.keys())[:5]}")
#         else:
#             print(f"{key}: {type(val).__name__} = {str(val)[:80]}")

In [None]:
# Disabled exploration cell (kept for reference, nothing here executes).
# When uncommented it reads one participant's actions JSON-lines file and
# prints each distinct action_type in the order it first appears.

# actions_file = os.path.join(SESSION_DATA_DIR, "MIT009", "actions_20251031_151702.jsonl")

# actions = []
# with open(actions_file, "r") as f:
#     for line in f:
#         if line.strip():
#             actions.append(json.loads(line))

# # Collect the distinct action types, preserving first-seen order
# unique = []
# for a in actions:
#     evt = a.get("action_type")
#     if evt not in unique:
#         unique.append(evt)

# print("Unique action_types (in order):\n")
# for e in unique:
#     print(f"  {e}")

In [None]:
# Preview the mtrack folder: total entry count plus the first fifteen names
# in sorted order.
mtrack_entries = os.listdir(mtrack_dir)
mtrack_entries.sort()
print(f"Files in mtrack ({len(mtrack_entries)} total):\n")
for entry_name in mtrack_entries[:15]:
    print(f"  {entry_name}")

In [None]:
# Phase name -> the action_type values that open ("start") and close ("end")
# that phase in a participant's actions log. Candidate events are listed in
# priority order, and the phases themselves are kept in this insertion order.
PHASE_DEFINITIONS = {
    phase_label: {"start": list(start_events), "end": list(end_events)}
    for phase_label, start_events, end_events in (
        (
            "Descriptive_Stress",
            ("TASK_STARTED", "DESCRIPTIVE_TASK_STARTED"),
            ("DESCRIPTIVE_COUNTDOWN_AUTO_TRANSITION",),
        ),
        (
            "Stroop_Stress",
            (
                "STROOP_VIDEO_STARTED_3_MIN",
                "STROOP_VIDEO_STARTED",
                "STROOP_TASK_STARTED",
            ),
            ("STROOP_VIDEO_END_TRANSITION",),
        ),
        (
            "Math_Stress",
            ("MATH_TASK_STARTED",),
            ("MATH_COUNTDOWN_AUTO_TRANSITION",),
        ),
        (
            "MollyIntervention",
            ("CONTENT_PERFORMANCE_SCREEN_DISPLAYED",),
            ("CONTENT_PERFORMANCE_COMPLETED",),
        ),
        (
            "Post-Relaxation",
            ("POST_STUDY_COUNTDOWN_STARTED",),
            ("POST_STUDY_COUNTDOWN_AUTO_TRANSITION",),
        ),
    )
}

In [None]:
# Per-phase pair of numbers; presumably (min, max) plausible duration in
# minutes, judging by the trailing notes -- TODO confirm. Not referenced by
# the cells visible in this notebook.
EXPECTED_DURATIONS = dict(
    [
        ("Baseline", (5, 15)),  # 7 min ish
        ("Descriptive_Stress", (4, 7)),  # 5 min
        ("Stroop_Stress", (1, 5)),  # 2-3 min
        ("Math_Stress", (4, 7)),  # 5 min
        ("MollyIntervention", (1, 15)),
        ("Post-Relaxation", (5, 12)),  # 7 min ish
    ]
)

# Hard upper limit, in minutes, for phases whose measured duration is
# truncated when it runs longer.
MAX_CAPS = dict([("Post-Relaxation", 12)])



In [None]:
# Discover participant IDs from the mtrack export file names.
xlsx_files = glob.glob(os.path.join(mtrack_dir, "MIT*_SCENARIO_*.xlsx"))

# The glob's "MIT*" also accepts names with no digits right after "MIT"
# (e.g. "MITx_SCENARIO_1.xlsx"). re.match returns None for those, so skip
# them instead of crashing on `.group(1)`.
participant_id_matches = (
    re.match(r"(MIT\d+)", os.path.basename(xlsx_path)) for xlsx_path in xlsx_files
)
participant_ids = sorted(
    {id_match.group(1) for id_match in participant_id_matches if id_match}
)

print(f"{len(participant_ids)} participants: {participant_ids}\n")

In [None]:
def _load_actions(path):
    """Read a JSON-lines actions log into a list of records, skipping blank lines."""
    with open(path, "r", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def _load_mtrack(path):
    """Read an mtrack xlsx export and add tz-aware dates plus `duration_sec`.

    Any timezone on `start_date` / `end_date` is dropped and the wall-clock
    value is re-labelled as LOCAL_TZ; `duration_sec` is end minus start.
    """
    df = pd.read_excel(path)
    for col in ("start_date", "end_date"):
        df[col] = pd.to_datetime(df[col]).dt.tz_localize(None).dt.tz_localize(LOCAL_TZ)
    df["duration_sec"] = (df["end_date"] - df["start_date"]).dt.total_seconds()
    return df


def _first_index(actions, action_types, start=0):
    """Return the index of the first matching action at or after `start`.

    `action_types` is tried in order: the first type that occurs anywhere in
    `actions[start:]` wins, and the index of its earliest occurrence is
    returned. Returns None when none of the types occur.
    """
    for wanted in action_types:
        for idx in range(start, len(actions)):
            if actions[idx].get("action_type") == wanted:
                return idx
    return None


def _action_local_time(action):
    """Return the action's `timestamp.local` as a Timestamp, or None if absent.

    Values parsed without timezone info are localized to LOCAL_TZ.
    """
    raw_ts = (action.get("timestamp") or {}).get("local")
    if not raw_ts:
        return None
    parsed = pd.to_datetime(raw_ts)
    if parsed.tzinfo is None:
        parsed = parsed.tz_localize(LOCAL_TZ)
    return parsed


# Per-participant phase extraction. For each participant this builds a phase
# table from the actions log (plus the mtrack export for the baseline end),
# writes <DATA_FINAL_DIR>/<participant>/session.csv, and records the outcome
# in `results` / `all_sessions` for the summary cells that follow.
all_sessions = []
results = {"success": [], "failed": [], "truncated": []}

for participant_id in participant_ids:
    print(f"\n{participant_id}:")

    # ------ Load actions jsonl -----------------
    participant_folder = os.path.join(SESSION_DATA_DIR, participant_id)
    # Sorted so the file picked is deterministic when several logs exist.
    actions_files = sorted(
        glob.glob(os.path.join(participant_folder, "actions_*.jsonl"))
    )

    if not actions_files:
        print("  actions file: NOT FOUND")
        results["failed"].append(participant_id)
        continue

    actions = _load_actions(actions_files[0])

    # -------- Load mtrack xlsx ---
    xlsx_matches = sorted(
        glob.glob(os.path.join(mtrack_dir, f"{participant_id}_SCENARIO_*.xlsx"))
    )
    df_xlsx = _load_mtrack(xlsx_matches[0]) if xlsx_matches else None

    # --- Extract phases ---
    phases = []

    # Baseline starts at the first RELAXATION_COUNTDOWN_STARTED action and
    # ends at the end of the longest "rest" row in the mtrack export.
    baseline_start = None
    baseline_end = None

    relax_idx = _first_index(actions, ["RELAXATION_COUNTDOWN_STARTED"])
    if relax_idx is not None:
        baseline_start = _action_local_time(actions[relax_idx])

    if df_xlsx is not None:
        rest_rows = df_xlsx[df_xlsx["rsrc_id"] == "rest"]
        # Ignore rows without a usable duration so idxmax never sees all-NaN input.
        rest_durations = rest_rows["duration_sec"].dropna()
        if not rest_durations.empty:
            baseline_end = rest_rows.loc[rest_durations.idxmax(), "end_date"]

    if baseline_start is not None and baseline_end is not None:
        duration_s = (baseline_end - baseline_start).total_seconds()
        phases.append(
            {
                "phase_name": "Baseline",
                "phase_start": baseline_start,
                "phase_end": baseline_end,
                "phase_duration_s": duration_s,
            }
        )
        print(f"baseline: {duration_s/60:.1f} min")
    else:
        print("baseline: NOT FOUND!!")

    for phase_name, events in PHASE_DEFINITIONS.items():
        start_idx = _first_index(actions, events["start"])
        end_idx = None
        if start_idx is not None:
            # Only look for the end marker at or after the start marker.
            end_idx = _first_index(actions, events["end"], start=start_idx)

        if start_idx is None or end_idx is None:
            print(f"  {phase_name}: NOT FOUND")
            continue

        start_dt = _action_local_time(actions[start_idx])
        end_dt = _action_local_time(actions[end_idx])
        if start_dt is None or end_dt is None:
            # Markers exist but carry no local timestamp; report it instead
            # of dropping the phase silently.
            print(f"  {phase_name}: MISSING TIMESTAMP")
            continue

        duration_s = (end_dt - start_dt).total_seconds()
        duration_min = duration_s / 60

        cap_min = MAX_CAPS.get(phase_name)
        if cap_min is not None and duration_min > cap_min:
            # Clamp the phase to its cap and move the end time accordingly.
            original_min = duration_min
            duration_s = cap_min * 60
            end_dt = start_dt + pd.Timedelta(seconds=duration_s)
            print(
                f"  {phase_name}: {original_min:.1f} min -> TRUNCATED  {cap_min} min"
            )
            results["truncated"].append((participant_id, phase_name, original_min))
        else:
            print(f"  {phase_name}: {duration_min:.1f} min")

        phases.append(
            {
                "phase_name": phase_name,
                "phase_start": start_dt,
                "phase_end": end_dt,
                "phase_duration_s": duration_s,
            }
        )

    if not phases:
        results["failed"].append(participant_id)
        continue

    phases = sorted(phases, key=lambda x: x["phase_start"])

    total_start = phases[0]["phase_start"]  # earliest phase start
    total_end = phases[-1]["phase_end"]  # end of the phase that starts last
    total_duration_s = (total_end - total_start).total_seconds()
    session_id = total_start.strftime("%Y%m%d_%H%M%S")

    rows = [
        {
            "participant_id": participant_id,
            "session_id": session_id,
            "phase_name": p["phase_name"],
            "phase_start": p["phase_start"].strftime("%Y-%m-%dT%H:%M:%S.%f"),
            "phase_end": p["phase_end"].strftime("%Y-%m-%dT%H:%M:%S.%f"),
            "phase_duration_s": round(p["phase_duration_s"], 6),
            "total_duration_s": round(total_duration_s, 6),
        }
        for p in phases
    ]

    session_df = pd.DataFrame(rows)

    output_folder = os.path.join(DATA_FINAL_DIR, participant_id)
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "session.csv")
    session_df.to_csv(output_path, index=False)

    # Record the outcome; without these two lines the summary always shows
    # zero successes and the combined CSV is never built from this run.
    all_sessions.append(session_df)
    results["success"].append(participant_id)

In [None]:
# Summarize the extraction run: counts, failed participant IDs, and every
# phase whose duration was capped.
failed_ids = results["failed"]
truncated_phases = results["truncated"]

print(f"success:{len(results['success'])}")
print(f"failed: {len(failed_ids)}")
if failed_ids:
    print(f"{failed_ids}")
if truncated_phases:
    print("Truncated phases:")
    for participant, phase_label, original_minutes in truncated_phases:
        print(f"  {participant} {phase_label}: {original_minutes:.1f} min, capped")

In [None]:
# Stack the in-memory per-participant tables into one CSV (skipped when the
# list is empty) and report how many distinct participants it contains.
if len(all_sessions) > 0:
    combined_path = os.path.join(DATA_FINAL_DIR, "session_all_participants.csv")
    combined_df = pd.concat(all_sessions, ignore_index=True)
    combined_df.to_csv(combined_path, index=False)

    participant_count = combined_df["participant_id"].nunique()
    print(f"participants:{participant_count}")

In [None]:
# One-off schema migration for existing session.csv files: rename
# session_duration_s -> total_duration_s and drop session_start / session_end.
# MIT005 is not in the list.
participants_to_update = [f"MIT{number:03d}" for number in (1, 2, 3, 4, 6, 7, 8, 9, 10)]

updated, skipped = [], []

for pid in participants_to_update:
    session_path = os.path.join(DATA_FINAL_DIR, pid, "session.csv")

    if os.path.exists(session_path):
        # rename() leaves absent labels alone and errors="ignore" does the
        # same for drop(), so files already in the new layout pass through
        # unchanged.
        df = (
            pd.read_csv(session_path)
            .rename(columns={"session_duration_s": "total_duration_s"})
            .drop(columns=["session_start", "session_end"], errors="ignore")
        )
        df.to_csv(session_path, index=False)
        print(f"{pid}: updated ({len(df)} rows)")
        updated.append(pid)
    else:
        print(f"{pid}: session.csv not found")
        skipped.append(pid)

if skipped:
    print(f"  {skipped}")

In [None]:
# Rebuild the combined CSV from the per-participant session.csv files on
# disk, independent of what the extraction loop left in memory.
all_session_files = sorted(
    glob.glob(os.path.join(DATA_FINAL_DIR, "MIT*", "session.csv"))
)

print(f"Found {len(all_session_files)} session.csv files\n")

all_sessions = []

for session_path in all_session_files:
    # The participant ID is the name of the folder holding session.csv.
    pid = os.path.basename(os.path.dirname(session_path))
    df = pd.read_csv(session_path)
    all_sessions.append(df)
    print(f"{pid}: {len(df)} phases")

# pd.concat raises ValueError on an empty list, so only combine and write
# when at least one session file was found.
if all_sessions:
    combined_df = pd.concat(all_sessions, ignore_index=True)
    combined_path = os.path.join(DATA_FINAL_DIR, "session_all_participants.csv")
    combined_df.to_csv(combined_path, index=False)
else:
    print("No session.csv files found; session_all_participants.csv not written")