# TS2Vec IMU-Only for Hand Washing Detection (Frozen TS2Vec — like Biobank_nohumidity)

Uses **TS2Vec** as a **frozen feature extractor** (analogous to Biobank_nohumidity's frozen HarNet): pretrain TS2Vec once on all IMU windows, then LOSO with **encode only** and train the MLP classifier per fold.

**Pipeline (aligned with Biobank_nohumidity.ipynb):**
- Same data: `data/` and `new_data/`, labels from `lables/` (no humidity).
- Same windowing: window_size=500, step_size=250; IMU (acc_x, acc_y, acc_z) only.
- **Step 1 (once):** Pretrain TS2Vec on all subjects' IMU windows; freeze.
- **Step 2 (LOSO):** Encode train/test IMU windows with frozen TS2Vec; scale, augment, SMOTETomek, train MLP (focal loss, class weights); medfilt; accuracy and F1 per subject.

Reduces runtime (~90%) vs training TS2Vec per fold; embeddings consistent across folds.

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import json
from glob import glob
from scipy.signal import medfilt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.combine import SMOTETomek

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, GaussianNoise
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from ts2vec import TS2Vec


2026-02-23 09:49:25.194238: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):


In [3]:
base_path = "/Users/sonalimanoharan/Desktop/scientific_research/hw"
data_folders = ["data", "new_data"]
label_files = ["labels.csv", "lables_new.csv"]
save_model_path = os.path.join(base_path, "ts2vec_no_humid_saved_model")
os.makedirs(save_model_path, exist_ok=True)

window_size = 500
step_size = 250
imu_cols = ["acc_x", "acc_y", "acc_z"]
REPR_DIMS = 128
TS2VEC_EPOCHS = 50


In [4]:
def load_recs():
    all_dfs = []
    for data_folder, label_file in zip(data_folders, label_files):
        data_path = os.path.join(base_path, data_folder, "*.csv")
        label_path = os.path.join(base_path, "lables", label_file)
        for fname in glob(data_path):
            df = pd.read_csv(fname)
            subject_id_full = os.path.basename(fname).replace(".csv", "")
            all_dfs.append((fname, df, label_path, subject_id_full, data_folder))
    return all_dfs

def convert_to_binlabel(x):
    return 0 if x in ["Null", "dry"] else 1

def apply_labels(dfs):
    l_dfs = []
    for fname, df, label_path, subject_id, folder in dfs:
        label_df = pd.read_csv(label_path)
        label_df["filename"] = label_df["datetime"].apply(lambda x: os.path.basename(str(x)).strip())
        file_basename = os.path.basename(fname).strip()
        matched_row = label_df[label_df["filename"].apply(lambda x: x.endswith(file_basename))]
        if matched_row.empty:
            continue
        df = df.copy()
        df["label"] = "Null"
        label_info = json.loads(matched_row.iloc[0]["label"])
        for d in label_info:
            df.loc[d["start"]:d["end"], "label"] = d["timeserieslabels"][0]
        df["binlabel"] = df["label"].apply(convert_to_binlabel)
        df["subject"] = subject_id
        df["source_folder"] = folder
        l_dfs.append(df)
    return l_dfs

def create_windows(df, window_size, step_size):
    """Returns (X_imu_windows, labels). X_imu: (n_windows, window_size, 3)."""
    imu_list, labels_list = [], []
    for start in range(0, len(df) - window_size + 1, step_size):
        window = df.iloc[start:start + window_size]
        if not all(c in window.columns for c in imu_cols):
            continue
        imu = window[imu_cols].values.astype(np.float32)
        label_mode = window["binlabel"].mode()
        lab = label_mode.iloc[0] if not label_mode.empty else int(window["binlabel"].iloc[0])
        imu_list.append(imu)
        labels_list.append(lab)
    if not imu_list:
        return np.zeros((0, window_size, len(imu_cols)), dtype=np.float32), np.array([], dtype=np.int64)
    return np.stack(imu_list, axis=0), np.array(labels_list, dtype=np.int64)


In [5]:
def augment_data(X, y, augment_ratio=0.5):
    X_aug, y_aug = [], []
    for i in range(len(X)):
        if y[i] != 1:
            continue
        x_sample = X[i].copy()
        if np.random.rand() < augment_ratio:
            x_sample += np.random.normal(0, 0.01, size=x_sample.shape)
        X_aug.append(x_sample)
        y_aug.append(y[i])
    return np.array(X_aug), np.array(y_aug).reshape(-1, 1)

def focal_loss(gamma=2.0, alpha=0.25):
    def loss(y_true, y_pred):
        eps = 1e-7
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        return -tf.reduce_mean(alpha * tf.pow(1. - pt, gamma) * tf.math.log(pt))
    return loss

def build_model(input_dim):
    inp = Input(shape=(input_dim,))
    x = GaussianNoise(0.1)(inp)
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inp, x)
    model.compile(optimizer=Adam(learning_rate=1e-4), loss=focal_loss(), metrics=["accuracy"])
    return model


In [6]:
all_dfs = load_recs()
labeled_dfs = apply_labels(all_dfs)
excluded_subjects = {
    "2025-01-18-22-38-29_37959204-490b-4cd9-b647-94e743071951",
    "2025-01-28-21-43-21_e4380fee-3c78-4e38-936f-acd60513e279"
}
filtered_dfs = [df for df in labeled_dfs if df["subject"].iloc[0] not in excluded_subjects]
subjects = sorted(set(df["subject"].iloc[0] for df in filtered_dfs))
print(f"Found {len(subjects)} subjects (after exclusions)")
results = []


Found 18 subjects (after exclusions)


In [7]:
# Build per-subject windows once; pretrain TS2Vec on all IMU data (frozen encoder)
subject_windows = {}
for df in filtered_dfs:
    sid = df["subject"].iloc[0]
    if sid not in subject_windows:
        subject_windows[sid] = {"imu": [], "y": []}
    imu_w, label = create_windows(df, window_size, step_size)
    subject_windows[sid]["imu"].append(imu_w)
    subject_windows[sid]["y"].append(label)

for sid in subject_windows:
    subject_windows[sid]["imu"] = np.concatenate(subject_windows[sid]["imu"], axis=0)
    subject_windows[sid]["y"] = np.concatenate(subject_windows[sid]["y"], axis=0)

X_all_imu = np.concatenate([subject_windows[s]["imu"] for s in subjects], axis=0)
print(f"Pretraining TS2Vec on {X_all_imu.shape[0]} IMU windows (all subjects)...")

use_gpu = bool(tf.config.list_physical_devices("GPU"))
device_ts2vec = 0 if use_gpu else 'cpu'
ts2vec_model = TS2Vec(
    input_dims=X_all_imu.shape[2],
    output_dims=REPR_DIMS,
    device=device_ts2vec,
    batch_size=32,
)
ts2vec_model.fit(X_all_imu, n_epochs=TS2VEC_EPOCHS, verbose=True)
print("TS2Vec pretrained and frozen. LOSO will use encode() only.")


Pretraining TS2Vec on 12927 IMU windows (all subjects)...
Epoch #0: loss=1540103.5995851427
Epoch #1: loss=62571.509940097705
Epoch #2: loss=29661.719871956422
Epoch #3: loss=18496.4987811143
Epoch #4: loss=12272.1525557828
Epoch #5: loss=9758.31838360732
Epoch #6: loss=8171.321866397527
Epoch #7: loss=6561.42717036775
Epoch #8: loss=6152.3666428786055
Epoch #9: loss=4369.973463186259
Epoch #10: loss=3548.5105368185873
Epoch #11: loss=3859.325684502462
Epoch #12: loss=4262.947236697668
Epoch #13: loss=1910.1906137016808
Epoch #14: loss=3071.276322054804
Epoch #15: loss=3497.343320936482
Epoch #16: loss=1635.3494913938914
Epoch #17: loss=958.3393998441862
Epoch #18: loss=2905.6886720113066
Epoch #19: loss=511.04151269046605
Epoch #20: loss=1613.8868663021117
Epoch #21: loss=336.650642413951
Epoch #22: loss=618.5409828356419
Epoch #23: loss=1707.9568300838803
Epoch #24: loss=310.3280966406129
Epoch #25: loss=739.1338213975022
Epoch #26: loss=132.2578472109054
Epoch #27: loss=210.08313178

In [8]:
# LOSO: frozen TS2Vec — encode only; train MLP on embeddings (same flow as Biobank_nohumidity)
for subject in subjects:
    print(f"\nLOSO fold — test subject: {subject}")
    train_subs = [s for s in subjects if s != subject]
    X_train_imu = np.concatenate([subject_windows[s]["imu"] for s in train_subs], axis=0)
    y_train = np.concatenate([subject_windows[s]["y"] for s in train_subs], axis=0)
    X_test_imu = subject_windows[subject]["imu"]
    y_test = subject_windows[subject]["y"]

    if X_train_imu.shape[0] < 2 or X_test_imu.shape[0] == 0:
        print("  Skipping: not enough data.")
        results.append({"subject": subject, "accuracy": np.nan, "f1_score": np.nan})
        continue

    # Frozen TS2Vec: encode only
    train_repr = ts2vec_model.encode(X_train_imu, encoding_window='full_series')
    test_repr = ts2vec_model.encode(X_test_imu, encoding_window='full_series')

    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_repr)
    X_test = scaler.transform(test_repr)

    y_train_2d = y_train.reshape(-1, 1)
    X_aug, y_aug = augment_data(X_train, y_train)
    X_train = np.vstack([X_train, X_aug])
    y_train_2d = np.vstack([y_train_2d, y_aug])

    X_train, y_flat = SMOTETomek(random_state=42).fit_resample(X_train, y_train_2d.flatten())
    y_train_2d = y_flat.reshape(-1, 1)

    cw = compute_class_weight("balanced", classes=np.unique(y_train_2d), y=y_train_2d.flatten())
    class_weight_dict = {i: float(cw[i]) for i in range(len(cw))}

    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train_2d, validation_data=(X_test, y_test.reshape(-1, 1)), epochs=50, batch_size=32,
              class_weight=class_weight_dict,
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)], verbose=0)

    y_pred_prob = model.predict(X_test, verbose=0)
    y_pred_raw = (y_pred_prob > 0.5).astype("int32").flatten()
    y_pred = medfilt(y_pred_raw, kernel_size=3)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    results.append({"subject": subject, "accuracy": acc, "f1_score": f1})
    print(f"  Accuracy: {acc:.4f}, F1: {f1:.4f}")



LOSO fold — test subject: 2024-12-04-18-49-30_c5c72868-633a-4672-8bdd-3a457f994ddb


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



  Accuracy: 0.9168, F1: 0.5455

LOSO fold — test subject: 2024-12-08-21-41-18_c1291a19-92af-431e-9608-6044389d26b0
  Accuracy: 0.9170, F1: 0.6503

LOSO fold — test subject: 2024-12-10-19-42-27_4734a243-b638-4004-aa82-c698f3ef7aba
  Accuracy: 0.8945, F1: 0.4737

LOSO fold — test subject: 2025-01-18-13-08-43_449ee30d-3245-47ca-9769-752cf0d2edb7
  Accuracy: 0.9344, F1: 0.5155

LOSO fold — test subject: 2025-01-19-18-41-39_c4d73c9a-93b2-4c1b-9f76-492d76f7731d
  Accuracy: 0.8419, F1: 0.2553

LOSO fold — test subject: 2025-01-19-19-48-01_c2031779-881c-4c5c-9c6e-b3f4d57601a9
  Accuracy: 0.8597, F1: 0.1803

LOSO fold — test subject: 34414785-1f38-4ff1-a709-e3bd0f5e7d42
  Accuracy: 0.9771, F1: 0.7385

LOSO fold — test subject: 383ea87a-3396-400b-9497-ee6f9ad7c093
  Accuracy: 0.9733, F1: 0.6429

LOSO fold — test subject: 6c516a60-1d5e-4d7c-a1dd-158099033fe7
  Accuracy: 0.9676, F1: 0.6914

LOSO fold — test subject: 8bb7b2a8-0d9b-4aaa-ad3a-c15fedb2ad31
  Accuracy: 0.9733, F1: 0.6429

LOSO fold — t

In [9]:
results_df = pd.DataFrame(results)
print("\n" + "="*60)
print("TS2Vec IMU-Only — LOSO Summary (frozen TS2Vec, MLP only per fold)")
print("="*60)
print(results_df.to_string())
valid = results_df.dropna(subset=["f1_score"])
if len(valid) > 0:
    print(f"\nMean Accuracy: {valid['accuracy'].mean():.4f}")
    print(f"Mean F1:      {valid['f1_score'].mean():.4f}")
out_path = os.path.join(save_model_path, "TS2Vec_IMU_Only_frozen.csv")
results_df.to_csv(out_path, index=False)
print(f"\nResults saved to {out_path}")



TS2Vec IMU-Only — LOSO Summary (frozen TS2Vec, MLP only per fold)
                                                     subject  accuracy  f1_score
0   2024-12-04-18-49-30_c5c72868-633a-4672-8bdd-3a457f994ddb  0.916805  0.545455
1   2024-12-08-21-41-18_c1291a19-92af-431e-9608-6044389d26b0  0.917031  0.650307
2   2024-12-10-19-42-27_4734a243-b638-4004-aa82-c698f3ef7aba  0.894459  0.473684
3   2025-01-18-13-08-43_449ee30d-3245-47ca-9769-752cf0d2edb7  0.934449  0.515464
4   2025-01-19-18-41-39_c4d73c9a-93b2-4c1b-9f76-492d76f7731d  0.841867  0.255319
5   2025-01-19-19-48-01_c2031779-881c-4c5c-9c6e-b3f4d57601a9  0.859748  0.180328
6                       34414785-1f38-4ff1-a709-e3bd0f5e7d42  0.977058  0.738462
7                       383ea87a-3396-400b-9497-ee6f9ad7c093  0.973262  0.642857
8                       6c516a60-1d5e-4d7c-a1dd-158099033fe7  0.967617  0.691358
9                       8bb7b2a8-0d9b-4aaa-ad3a-c15fedb2ad31  0.973262  0.642857
10                      8f0ce2c4-d123-4c1c