# TS2Vec + Humidity for Hand Washing Detection (Frozen TS2Vec — like Biobank_humidity)

Uses **TS2Vec** (self-supervised time series representations) for IMU windows and the **same humidity handcrafted features** as Biobank_humidity, then trains a classifier on the concatenated representation.

**Pipeline (aligned with Biobank_humidity.ipynb):**
- Same data: `data/` and `new_data/`, labels from `lables/`, **clean_humidity** applied.
- Same windowing: window_size=500, step_size=250; IMU (acc_x, acc_y, acc_z) + humidity per window.
- Per window: (1) **TS2Vec** encodes the IMU time series into a fixed-size embedding (`REPR_DIMS` = 128). (2) **advanced_humid_features** computes 11 summary statistics from the window’s humidity. (3) Concatenate [humid_features * `humidity_weight` (2.0), ts2vec_embedding], standardize with `StandardScaler`, augment the positive class with jittered copies, rebalance with SMOTETomek, then train the same MLP (focal loss, class weights).
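- Leave-one-subject-out (LOSO) evaluation with the same excluded_subjects; predictions are smoothed with a median filter (`medfilt`, kernel size 3); accuracy and F1 are reported per subject.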

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import json
from glob import glob
from scipy.signal import find_peaks, medfilt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.combine import SMOTETomek

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, GaussianNoise
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from ts2vec import TS2Vec



In [5]:
# Root directory that holds the recording folders and the label directory.
# NOTE(review): absolute, machine-specific path — must be edited to run on another machine.
base_path = "/Users/sonalimanoharan/Desktop/scientific_research/hw"
# Recording folders under base_path; load_recs() pairs them positionally with label_files.
data_folders = ["data", "new_data"]
# Label CSV names; load_recs() looks them up inside the "lables" directory under base_path.
label_files = ["labels.csv", "lables_new.csv"]
# Output directory (the LOSO results CSV is written here); created eagerly if missing.
save_model_path = os.path.join(base_path, "ts2vec_saved_model")
os.makedirs(save_model_path, exist_ok=True)

# Sliding-window geometry in rows of the recording: 500-row windows advanced by 250 rows (50% overlap).
window_size = 500
step_size = 250
# Columns fed to TS2Vec as the multivariate IMU series.
imu_cols = ["acc_x", "acc_y", "acc_z"]
# NOTE(review): not referenced by the code visible in this notebook, which uses the literal "humid".
humid_col = ["humid"]
# TS2Vec embedding size (passed as output_dims).
REPR_DIMS = 128
# Number of TS2Vec pretraining epochs (passed as n_epochs).
TS2VEC_EPOCHS = 50
# Multiplier applied to the humidity features before they are concatenated with the embedding.
humidity_weight = 2.0


In [6]:
def clean_humidity(df):
    """Return the recording with the humidity artifact value interpolated away.

    Every sample of the "humid" column equal to the artifact reading
    79.1318359375 is treated as missing and filled by linear interpolation
    (in both directions), followed by a forward/backward fill for anything
    still missing.

    Args:
        df: recording as a DataFrame.

    Returns:
        A cleaned copy of ``df`` when it has a "humid" column; otherwise
        ``df`` itself, untouched.
    """
    if "humid" not in df.columns:
        return df

    cleaned = df.copy()
    humidity = cleaned["humid"].replace(79.1318359375, np.nan)
    humidity = humidity.interpolate(method='linear', limit_direction='both')
    cleaned["humid"] = humidity.ffill().bfill()
    return cleaned

def load_recs():
    """Read every recording CSV and pair it with the label file of its folder.

    Walks ``data_folders`` and ``label_files`` in lockstep; each CSV found in
    a data folder is read, passed through ``clean_humidity`` and recorded
    together with the path of that folder's label file.

    Returns:
        A list of ``(csv_path, dataframe, label_path, subject_id, folder)``
        tuples, where ``subject_id`` is the CSV file name with ".csv" removed.
    """
    label_dir = os.path.join(base_path, "lables")
    records = []
    for folder, label_name in zip(data_folders, label_files):
        label_path = os.path.join(label_dir, os.path.basename(label_name))
        pattern = os.path.join(base_path, folder, "*.csv")
        for csv_path in glob(pattern):
            recording = clean_humidity(pd.read_csv(csv_path))
            subject_id_full = os.path.basename(csv_path).replace(".csv", "")
            records.append((csv_path, recording, label_path, subject_id_full, folder))
    return records

def convert_to_binlabel(x):
    """Map a label to a binary class: 0 for "Null" or "dry", 1 for anything else."""
    is_negative = x in ("Null", "dry")
    return int(not is_negative)

def apply_labels(dfs):
    """Attach per-row labels to each recording from its label CSV.

    For every recording, the label table is searched for the row whose
    "datetime" entry (reduced to its base name) ends with the recording's
    file name. That row's "label" cell is parsed as JSON: a list of dicts
    with "start", "end" and "timeserieslabels". Rows ``start`` through
    ``end`` (``.loc`` slicing, so both ends are included) receive the first
    entry of "timeserieslabels"; all other rows stay "Null".

    Each label CSV is read and indexed only once, however many recordings
    refer to it.

    Args:
        dfs: iterable of ``(fname, df, label_path, subject_id, folder)``
            tuples, as produced by ``load_recs``.

    Returns:
        A list of DataFrame copies with the added columns "label",
        "binlabel", "subject" and "source_folder". Recordings without a
        matching label row are left out.
    """
    label_tables = {}  # label_path -> label table with its "filename" column
    l_dfs = []
    for fname, df, label_path, subject_id, folder in dfs:
        label_df = label_tables.get(label_path)
        if label_df is None:
            label_df = pd.read_csv(label_path)
            label_df["filename"] = label_df["datetime"].apply(lambda x: os.path.basename(str(x)).strip())
            label_tables[label_path] = label_df

        file_basename = os.path.basename(fname).strip()
        matched_row = label_df[label_df["filename"].apply(lambda x: x.endswith(file_basename))]
        if matched_row.empty:
            # No annotation for this recording: it is skipped, not treated as all-"Null".
            continue

        df = df.copy()
        df["label"] = "Null"
        label_info = json.loads(matched_row.iloc[0]["label"])
        for d in label_info:
            df.loc[d["start"]:d["end"], "label"] = d["timeserieslabels"][0]
        df["binlabel"] = df["label"].apply(convert_to_binlabel)
        df["subject"] = subject_id
        df["source_folder"] = folder
        l_dfs.append(df)
    return l_dfs

def advanced_humid_features(humid):
    """Summarise one window of humidity samples as 11 statistics.

    Args:
        humid: 1-D sequence of humidity readings for the window.

    Returns:
        A float32 array of length 11, in this order: mean, standard
        deviation, maximum, minimum, median, number of samples above 50,
        last-minus-first sample, 90th-minus-10th percentile, mean and
        standard deviation of the first difference (first element taken as
        0), and the number of peaks with height of at least 55.
    """
    series = pd.Series(humid)
    # The first difference is undefined for the first sample; it is set to 0.
    deltas = series.diff().fillna(0)
    peak_positions, _ = find_peaks(series, height=55)

    level_stats = [
        np.mean(series), np.std(series),
        np.max(series), np.min(series),
        np.median(series), np.sum(series > 50),
    ]
    net_change = series.iloc[-1] - series.iloc[0]
    spread = np.percentile(series, 90) - np.percentile(series, 10)
    change_stats = [np.mean(deltas), np.std(deltas)]

    features = level_stats + [net_change, spread] + change_stats + [len(peak_positions)]
    return np.array(features, dtype=np.float32)

def create_windows(df, window_size, step_size):
    """Slice a labelled recording into fixed-length sliding windows.

    Windows start every ``step_size`` rows and span ``window_size`` rows; a
    trailing remainder shorter than ``window_size`` is dropped. Each window's
    label is the most frequent "binlabel" value inside it.

    Args:
        df: recording with a "binlabel" column, the ``imu_cols`` columns and,
            optionally, a "humid" column.
        window_size: number of rows per window.
        step_size: number of rows between consecutive window starts.

    Returns:
        ``(imu, humid_features, labels)`` where ``imu`` is float32 of shape
        ``(n, window_size, len(imu_cols))`` holding the raw IMU samples,
        ``humid_features`` is float32 of shape ``(n, 11)`` (all zeros when the
        recording has no "humid" column) and ``labels`` is int64 of shape
        ``(n,)``. ``n`` is 0 when the recording is shorter than one window or
        lacks any of the IMU columns.
    """
    n_humid_features = 11  # length of the vector returned by advanced_humid_features

    # Row slicing never changes the set of columns, so check them once, not per window.
    has_imu = all(c in df.columns for c in imu_cols)
    has_humid = "humid" in df.columns
    starts = range(0, len(df) - window_size + 1, step_size) if has_imu else ()

    imu_windows, humid_features, labels = [], [], []
    for start in starts:
        window = df.iloc[start:start + window_size]
        imu = window[imu_cols].values.astype(np.float32)
        if has_humid:
            hf = advanced_humid_features(window["humid"].values.squeeze())
        else:
            hf = np.zeros(n_humid_features, dtype=np.float32)
        # Majority vote; mode() lists tied values in ascending order, so a tie resolves to the smaller label.
        label_mode = window["binlabel"].mode()
        lab = label_mode.iloc[0] if not label_mode.empty else int(window["binlabel"].iloc[0])
        imu_windows.append(imu)
        humid_features.append(hf)
        labels.append(lab)

    if not imu_windows:
        return (
            np.zeros((0, window_size, len(imu_cols)), dtype=np.float32),
            np.zeros((0, n_humid_features), dtype=np.float32),
            np.array([], dtype=np.int64),
        )
    return (
        np.stack(imu_windows, axis=0),
        np.stack(humid_features, axis=0),
        np.array(labels, dtype=np.int64),
    )


In [7]:
def augment_data(X, y, humid_dim, augment_ratio=0.5):
    """Create extra class-1 samples by jittering the embedding part of the features.

    Every class-1 row of ``X`` is copied. Each copy has, with probability
    ``augment_ratio``, Gaussian noise (mean 0, standard deviation 0.01) added
    to the columns after the first ``humid_dim``; the remaining copies are
    exact duplicates. The humidity columns are never altered and ``X`` itself
    is not modified.

    Args:
        X: 2-D feature array of shape ``(n_samples, n_features)`` whose first
            ``humid_dim`` columns are the humidity features.
        y: labels, one per row of ``X`` (1-D or a column vector).
        humid_dim: number of leading humidity columns to leave untouched.
        augment_ratio: probability that a copied row receives noise.

    Returns:
        ``(X_aug, y_aug)`` with shapes ``(n_pos, n_features)`` and
        ``(n_pos, 1)``, where ``n_pos`` is the number of class-1 rows. With no
        class-1 rows the arrays are empty but keep those shapes, so they can
        still be stacked onto the training set.
    """
    X = np.asarray(X)
    y_flat = np.asarray(y).reshape(-1)

    positives = y_flat == 1
    # Boolean indexing already copies; the explicit copy documents that X stays untouched.
    X_aug = X[positives].copy()
    y_aug = y_flat[positives].reshape(-1, 1)

    jitter_rows = np.random.rand(X_aug.shape[0]) < augment_ratio
    n_jitter = int(jitter_rows.sum())
    n_repr_cols = X_aug[:, humid_dim:].shape[1]
    if n_jitter and n_repr_cols:
        noise = np.random.normal(0, 0.01, size=(n_jitter, n_repr_cols))
        X_aug[jitter_rows, humid_dim:] += noise
    return X_aug, y_aug

def focal_loss(gamma=2.0, alpha=0.25):
    """Build a binary focal-loss function for a sigmoid output.

    The returned ``loss(y_true, y_pred)`` computes the mean over all elements
    of ``-alpha * (1 - pt) ** gamma * log(pt)``, where ``pt`` is the predicted
    probability of the true class (``y_pred`` where ``y_true == 1``, otherwise
    ``1 - y_pred``). Predictions are clipped to ``[1e-7, 1 - 1e-7]`` before the
    logarithm.

    Note that ``alpha`` multiplies both classes equally, so it only rescales
    the loss; it does not weight one class against the other.

    Args:
        gamma: exponent of the ``(1 - pt)`` modulating factor.
        alpha: constant scale applied to every term.

    Returns:
        A callable ``loss(y_true, y_pred)`` returning a scalar tensor.
    """
    eps = 1e-7

    def loss(y_true, y_pred):
        clipped = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        is_positive = tf.equal(y_true, 1)
        pt = tf.where(is_positive, clipped, 1 - clipped)
        modulating = tf.pow(1. - pt, gamma)
        return -tf.reduce_mean(alpha * modulating * tf.math.log(pt))

    return loss

def build_model(input_dim):
    """Build and compile the MLP classifier used in every fold.

    Architecture: Gaussian input noise (stddev 0.1), then two hidden stages of
    Dense(ReLU) -> BatchNormalization -> Dropout(0.3) with 128 and 64 units,
    then a single sigmoid output unit. Compiled with Adam (learning rate 1e-4),
    the default ``focal_loss()`` and the accuracy metric.

    Args:
        input_dim: number of input features.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(input_dim,))
    hidden = GaussianNoise(0.1)(inputs)
    for units in (128, 64):
        hidden = Dense(units, activation="relu")(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(0.3)(hidden)
    outputs = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs, outputs)
    classifier.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss=focal_loss(),
        metrics=["accuracy"],
    )
    return classifier


In [8]:
# Load every recording, attach its labels, and drop the recordings excluded from evaluation.
all_dfs = load_recs()
labeled_dfs = apply_labels(all_dfs)

# Recording ids (CSV file names without ".csv") that are left out of the experiment.
excluded_subjects = {
    "2025-01-18-22-38-29_37959204-490b-4cd9-b647-94e743071951",
    "2025-01-28-21-43-21_e4380fee-3c78-4e38-936f-acd60513e279",
}

# The "subject" column is constant within a recording, so its first value identifies it.
filtered_dfs = [rec for rec in labeled_dfs if rec["subject"].iloc[0] not in excluded_subjects]
subjects = sorted({rec["subject"].iloc[0] for rec in filtered_dfs})
print(f"Found {len(subjects)} subjects (after exclusions)")

# One metrics row per LOSO fold; filled by the evaluation loop.
results = []


Found 18 subjects (after exclusions)


In [None]:
# Build each subject's windows once; they are reused for TS2Vec pretraining and for every LOSO fold.
subject_windows = {}
for df in filtered_dfs:
    sid = df["subject"].iloc[0]
    bucket = subject_windows.setdefault(sid, {"imu": [], "humid": [], "y": []})
    imu_w, humid_f, label = create_windows(df, window_size, step_size)
    for key, part in zip(("imu", "humid", "y"), (imu_w, humid_f, label)):
        bucket[key].append(part)

# Collapse each subject's list of per-recording arrays into one array per field.
for sid, bucket in subject_windows.items():
    for key in ("imu", "humid", "y"):
        bucket[key] = np.concatenate(bucket[key], axis=0)

# Pretrain TS2Vec once on the IMU windows of every subject; the LOSO folds only call encode().
# NOTE(review): this includes the windows of each fold's held-out subject. No labels are used,
# but the encoder has seen the test subject's signals — confirm this transductive setup is intended.
X_all_imu = np.concatenate([subject_windows[s]["imu"] for s in subjects], axis=0)
print(f"Pretraining TS2Vec on {X_all_imu.shape[0]} IMU windows (all subjects) ...")

# The reference TS2Vec implementation runs on PyTorch, so PyTorch (not TensorFlow) must decide
# whether CUDA device 0 is usable: TensorFlow can report a GPU that PyTorch cannot address
# (e.g. a Metal GPU on macOS), in which case device=0 would fail.
import torch

use_gpu = torch.cuda.is_available()
device_ts2vec = 0 if use_gpu else 'cpu'
ts2vec_model = TS2Vec(
    input_dims=X_all_imu.shape[2],
    output_dims=REPR_DIMS,
    device=device_ts2vec,
    batch_size=32,
)
ts2vec_model.fit(X_all_imu, n_epochs=TS2VEC_EPOCHS, verbose=True)
print("TS2Vec pretrained and frozen. LOSO will use encode() only.")

Pretraining TS2Vec on 12927 IMU windows (all subjects) ...


In [None]:
# LOSO: frozen TS2Vec — encode only; train MLP on [humid_features * weight, ts2vec_embedding]
results = []  # start clean so re-running this cell does not append duplicate rows

# The encoder is frozen, so a subject's embeddings do not depend on the fold: encode each subject once
# instead of re-encoding the whole training set in every fold.
subject_repr = {}
for s in subjects:
    imu_s = subject_windows[s]["imu"]
    if imu_s.shape[0] == 0:
        subject_repr[s] = np.zeros((0, REPR_DIMS), dtype=np.float32)
    else:
        subject_repr[s] = ts2vec_model.encode(imu_s, encoding_window='full_series')


def _fold_features(subs):
    """Stack [humid_features * humidity_weight, ts2vec_embedding] and the labels of the given subjects."""
    humid = np.concatenate([subject_windows[s]["humid"] for s in subs], axis=0)
    emb = np.concatenate([subject_repr[s] for s in subs], axis=0)
    y = np.concatenate([subject_windows[s]["y"] for s in subs], axis=0)
    # NOTE(review): StandardScaler (applied below) rescales every column to unit variance, which
    # cancels this constant factor; to actually emphasise humidity, weight after scaling.
    return np.concatenate([humid * humidity_weight, emb], axis=1), y


# Early stopping must not look at the test subject, so a share of the *training* subjects is
# held out (whole subjects, because neighbouring windows overlap) and used for validation.
VAL_SUBJECT_FRACTION = 0.15
val_rng = np.random.default_rng(42)

for subject in subjects:
    print(f"\nLOSO fold — test subject: {subject}")
    X_test, y_test = _fold_features([subject])
    train_subs = [s for s in subjects if s != subject and len(subject_windows[s]["y"]) > 0]

    n_val = 0
    if len(train_subs) >= 2:
        n_val = max(1, int(round(VAL_SUBJECT_FRACTION * len(train_subs))))
        n_val = min(n_val, len(train_subs) - 1)
    val_idx = set(val_rng.choice(len(train_subs), size=n_val, replace=False).tolist()) if n_val else set()
    val_subs = [s for i, s in enumerate(train_subs) if i in val_idx]
    fit_subs = [s for i, s in enumerate(train_subs) if i not in val_idx]

    n_fit = sum(len(subject_windows[s]["y"]) for s in fit_subs)
    if n_fit < 2 or X_test.shape[0] == 0:
        print("  Skipping: not enough data.")
        results.append({"subject": subject, "accuracy": np.nan, "f1_score": np.nan})
        continue

    X_train, y_train = _fold_features(fit_subs)

    # Fit the scaler on the fitting subjects only; validation and test data are just transformed.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    if val_subs:
        X_val, y_val = _fold_features(val_subs)
        validation_data = (scaler.transform(X_val), y_val.reshape(-1, 1))
        monitor = "val_loss"
    else:
        # Only one training subject available: nothing to hold out, fall back to the training loss.
        validation_data = None
        monitor = "loss"

    # Duplicate/jitter the positive class, then rebalance; both touch the fitting set only.
    humid_dim = subject_windows[subject]["humid"].shape[1]
    y_train_2d = y_train.reshape(-1, 1)
    X_aug, y_aug = augment_data(X_train, y_train, humid_dim=humid_dim)
    if len(X_aug):  # nothing to stack when the fitting set has no positive windows
        X_train = np.vstack([X_train, X_aug])
        y_train_2d = np.vstack([y_train_2d, y_aug])

    X_train, y_resampled = SMOTETomek(random_state=42).fit_resample(X_train, y_train_2d.ravel())
    y_train_2d = y_resampled.reshape(-1, 1)

    # Key the weights by the class labels themselves, not by their position in the array.
    classes = np.unique(y_train_2d)
    cw = compute_class_weight("balanced", classes=classes, y=y_train_2d.ravel())
    class_weight_dict = {int(c): float(w) for c, w in zip(classes, cw)}

    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train_2d, validation_data=validation_data, epochs=50, batch_size=32,
              class_weight=class_weight_dict,
              callbacks=[EarlyStopping(monitor=monitor, patience=5, restore_best_weights=True)], verbose=0)

    # Threshold the probabilities, then smooth isolated flips with a 3-window median filter.
    y_pred_prob = model.predict(X_test, verbose=0)
    y_pred_raw = (y_pred_prob > 0.5).astype("int32").flatten()
    y_pred = medfilt(y_pred_raw, kernel_size=3)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    results.append({"subject": subject, "accuracy": acc, "f1_score": f1})
    print(f"{subject} - Accuracy: {acc:.4f}, F1: {f1:.4f}")


In [None]:
# Summarise the per-subject LOSO metrics and save them as CSV.
# Fixing the columns keeps the frame well-formed (and dropna below valid) even when no fold ran.
results_df = pd.DataFrame(results, columns=["subject", "accuracy", "f1_score"])
print("\n" + "="*60)
print("TS2Vec + Humidity — LOSO Summary (frozen TS2Vec, MLP only per fold)")
print("="*60)
print(results_df.to_string())
# Skipped folds carry NaN metrics; leave them out of the averages.
valid = results_df.dropna(subset=["f1_score"])
if len(valid) > 0:
    print(f"\nMean Accuracy: {valid['accuracy'].mean():.4f}")
    print(f"Mean F1:      {valid['f1_score'].mean():.4f}")
out_path = os.path.join(save_model_path, "TS2Vec_Humidity_frozen.csv")
results_df.to_csv(out_path, index=False)
print(f"\nResults saved to {out_path}")
