In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/__results__.html
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/__package_validation_results__.txt
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/__notebook__.ipynb
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/__output__.json
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/custom.css
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/package/kagglehub_requirements.yaml
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/package/core.py
/kaggle/input/k/wasupandceacar/wasupandceacar/cmi-metric/package/__init__.py
/kaggle/input/cmi-model-3-label-encoder/label_encoder.pkl
/kaggle/input/cmi-model-3-utilities/train_universe.csv
/kaggle/input/cmi-model-3-utilities/best_fold3.pt
/kaggle/input/cmi-model-3-utilities/best_fold2.pt
/kaggle/input/cmi-model-3-utilities/best_fold0.pt
/kaggle/input/cmi-model-3-utilities/best_fold1.pt
/kaggle/input/cmi-model-3-utilities/best_fold4.pt
/kaggle/inp

# **Libraries**

In [3]:
import os
import torch
import kagglehub
from pathlib import Path
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from scipy.spatial.transform import Rotation as R
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.notebook import tqdm
from torch.amp import autocast
import pandas as pd
import polars as pl
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler, LabelEncoder
from transformers import BertConfig, BertModel
import time
import joblib

2025-09-01 10:52:52.819625: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756723972.991027      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756723973.039141      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# **Preprocessing Functions**

In [4]:
# Removing the gravity
def remove_gravity_from_acc(acc_data, rot_data):
    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data
    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    gravity_world = np.array([0, 0, 9.81])
    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i, :] = acc_values[i, :] 
            continue
        try:
            rotation = R.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
             linear_accel[i, :] = acc_values[i, :]
    return linear_accel

In [5]:
# Calclate angular velocity
def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data
    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))
    for i in range(num_samples - 1):
        q_t = quat_values[i]
        q_t_plus_dt = quat_values[i+1]
        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \
           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):
            continue
        try:
            rot_t = R.from_quat(q_t)
            rot_t_plus_dt = R.from_quat(q_t_plus_dt)
            delta_rot = rot_t.inv() * rot_t_plus_dt
            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            pass
    return angular_vel

In [6]:
# Calculate Angular Distance
def calculate_angular_distance(rot_data):
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data
    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)
    for i in range(num_samples - 1):
        q1 = quat_values[i]
        q2 = quat_values[i+1]
        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \
           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):
            angular_dist[i] = 0
            continue
        try:
            r1 = R.from_quat(q1)
            r2 = R.from_quat(q2)
            relative_rotation = r1.inv() * r2
            angle = np.linalg.norm(relative_rotation.as_rotvec())
            angular_dist[i] = angle
        except ValueError:
            angular_dist[i] = 0 
            pass
    return angular_dist

# **Dataset Classes and Functions**

This `CMIFeDataset` class is a **PyTorch Dataset** that:

1. **Loads raw CSV data** (with IMU, thermal, and ToF sensor readings) and **config settings**.
2. **Initializes feature names** (engineered features for IMU, raw & aggregated stats for ToF, etc.).
3. **Generates or loads engineered features** if they’re not already precomputed:

   * IMU features: magnitudes, jerks, rotation angles, angular velocities/distances, gravity removal, etc.
   * ToF features: per-sensor statistics (mean, std, min, max) and optionally region-based aggregation.
4. **Encodes labels** (`gesture`) into integers and one-hot vectors.
5. **Handles missing data**:

   * Fills with forward/backward fill if configured.
   * Replaces remaining NaNs with a synthetic “nan value” that is later scaled consistently.
6. **Scales features** (either all at once or per-sensor type) using `StandardScaler`.
7. **Pads sequences** to a fixed length (`pad_len`, chosen as the 95th percentile of sequence lengths).
8. **Stores ready-to-train tensors** for IMU, thermal, and ToF data plus labels & class weights.
9. Provides:

   * **`__getitem__`** for PyTorch DataLoader compatibility.
   * **`inference_process`** for preparing new sequences in the exact same way as training data.
   * Utilities for getting scaled NaN tensors.

Basically — it’s a **complete dataset preparation pipeline** for a multimodal time-series classification task, handling:

* 1) Feature engineering
* 2) Missing data 
* 3) Normalization
* 4) Padding
* 5) Label encoding
* 6) Train/inference consistency 

In [7]:
data_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv'

**Steps to Convert Raw Dataset → Required Format**

In [8]:
# import warnings
# from tqdm import tqdm

# -------------------------
# Helper functions (user-provided, slightly adapted)
# -------------------------
def remove_gravity_from_acc(acc_data, rot_data):
    """
    acc_data: DataFrame or ndarray with columns ['acc_x','acc_y','acc_z']
    rot_data: DataFrame or ndarray with columns ['rot_x','rot_y','rot_z','rot_w']
    Returns ndarray (N,3) linear acceleration (gravity removed) in sensor frame.
    """
    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values.astype(float)
    else:
        acc_values = np.asarray(acc_data, dtype=float)

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values.astype(float)
    else:
        quat_values = np.asarray(rot_data, dtype=float)

    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    gravity_world = np.array([0.0, 0.0, 9.81], dtype=float)

    for i in range(num_samples):
        q = quat_values[i]
        if q is None or np.any(np.isnan(q)) or np.allclose(q, 0.0):
            linear_accel[i, :] = acc_values[i, :]
            continue
        try:
            rotation = R.from_quat(q)
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except Exception:
            linear_accel[i, :] = acc_values[i, :]
    return linear_accel


def angular_velocity_from_quat_with_dt(quat_array, times=None, default_fs=200.0):
    """
    quat_array: (N,4) array in order (rot_x,rot_y,rot_z,rot_w)
    times: optional (N,) timestamps in seconds (or ms; function will normalize)
    default_fs: fallback sampling rate in Hz
    Returns angular_vel (N,3) where last row is zero.
    """
    quat_array = np.asarray(quat_array, dtype=float)
    N = quat_array.shape[0]
    ang_vel = np.zeros((N, 3), dtype=float)
    if N < 2:
        return ang_vel

    if times is not None:
        t = np.asarray(times, dtype=float)
        # heuristic: if timestamps look like epoch ms -> convert
        if np.nanmean(t) > 1e6:
            t = t / 1000.0
        dt = np.diff(t)
        # handle non-positive dt
        pos = dt[dt > 0]
        if pos.size == 0:
            median_dt = 1.0 / default_fs
        else:
            median_dt = np.median(pos)
        dt = np.where(dt > 0, dt, median_dt)
    else:
        dt = np.full(N-1, 1.0 / default_fs)

    for i in range(N-1):
        q_t = quat_array[i]
        q_tp1 = quat_array[i+1]
        if (np.any(np.isnan(q_t)) or np.allclose(q_t, 0.0)) or \
           (np.any(np.isnan(q_tp1)) or np.allclose(q_tp1, 0.0)):
            continue
        try:
            r_t = R.from_quat(q_t)
            r_tp1 = R.from_quat(q_tp1)
            delta = r_t.inv() * r_tp1
            rotvec = delta.as_rotvec()
            ang_vel[i, :] = rotvec / dt[i]
        except Exception:
            # keep zeros on failure
            pass
    return ang_vel


def calculate_angular_distance_with_times(quat_array, times=None):
    """
    Compute per-sample angular distance (angle between consecutive orientations).
    Returns array shape (N,) with last element 0.
    """
    quat_array = np.asarray(quat_array, dtype=float)
    N = quat_array.shape[0]
    ang_dist = np.zeros(N, dtype=float)
    if N < 2:
        return ang_dist

    for i in range(N-1):
        q1 = quat_array[i]
        q2 = quat_array[i+1]
        if (np.any(np.isnan(q1)) or np.allclose(q1, 0.0)) or \
           (np.any(np.isnan(q2)) or np.allclose(q2, 0.0)):
            ang_dist[i] = 0.0
            continue
        try:
            r1 = R.from_quat(q1)
            r2 = R.from_quat(q2)
            relative = r1.inv() * r2
            ang_dist[i] = np.linalg.norm(relative.as_rotvec())
        except Exception:
            ang_dist[i] = 0.0
    return ang_dist

# -------------------------
# Vectorized ToF region stats (avoids fragmentation/warnings)
# -------------------------
def compute_tof_region_stats_matrix(subdf_mat, modes):
    """
    subdf_mat: numpy array shape (N,64) with NaNs for missing values
    modes: list of ints, e.g. [4] or [2,4,8,16,32]
    Returns dict mapping column_name -> ndarray (N,)
    """
    new_cols = {}
    mat = subdf_mat.astype(float)  # (N,64)
    with np.errstate(invalid='ignore'):
        new_cols["mean"] = np.nanmean(mat, axis=1)
        new_cols["std"]  = np.nanstd(mat, axis=1)
        new_cols["min"]  = np.nanmin(mat, axis=1)
        new_cols["max"]  = np.nanmax(mat, axis=1)

    flat = mat.reshape(-1, 64)
    for mode in modes:
        if mode <= 0:
            continue
        region_size = 64 // mode if mode>0 and 64%mode==0 else max(1, 64 // mode)
        for r in range(mode):
            start = r * region_size
            end = start + region_size if (r < mode-1) else 64
            region_vals = flat[:, start:end]
            with np.errstate(all='ignore'):
                mean_v = np.nanmean(region_vals, axis=1)
                std_v  = np.nanstd(region_vals, axis=1)
                min_v  = np.nanmin(region_vals, axis=1)
                max_v  = np.nanmax(region_vals, axis=1)
            # keys will be filled by caller with tofid prefix
            new_cols[f"r{mode}_{r}_mean"] = mean_v
            new_cols[f"r{mode}_{r}_std"]  = std_v
            new_cols[f"r{mode}_{r}_min"]  = min_v
            new_cols[f"r{mode}_{r}_max"]  = max_v
    return new_cols


def add_all_tof_features_vectorized(df, tof_mode):
    """
    For each tof sensor 1..5, compute:
      tof_{i}_mean/std/min/max
      and for each mode in modes: tof{mode}_{i}_region_{r}_{stat}
    Returns a new DataFrame = pd.concat([df, new_cols_df], axis=1)
    """
    if tof_mode == 0:
        modes = []
    elif tof_mode == -1:
        modes = [2,4,8,16,32]
    else:
        modes = [tof_mode]

    all_new = {}
    N = len(df)
    for tof_id in range(1, 6):
        tof_cols = [f"tof_{tof_id}_v{p}" for p in range(64)]
        # ensure columns exist (create NaN if missing)
        for c in tof_cols:
            if c not in df.columns:
                df[c] = np.nan

        subdf = df[tof_cols].astype(float).replace(-1, np.nan)
        mat = subdf.values  # (N,64)
        new = compute_tof_region_stats_matrix(mat, modes)
        # prefix names and add to all_new
        all_new[f"tof_{tof_id}_mean"] = new["mean"]
        all_new[f"tof_{tof_id}_std"]  = new["std"]
        all_new[f"tof_{tof_id}_min"]  = new["min"]
        all_new[f"tof_{tof_id}_max"]  = new["max"]
        for mode in modes:
            for r in range(mode):
                all_new[f"tof{mode}_{tof_id}_region_{r}_mean"] = new[f"r{mode}_{r}_mean"]
                all_new[f"tof{mode}_{tof_id}_region_{r}_std"]  = new[f"r{mode}_{r}_std"]
                all_new[f"tof{mode}_{tof_id}_region_{r}_min"]  = new[f"r{mode}_{r}_min"]
                all_new[f"tof{mode}_{tof_id}_region_{r}_max"]  = new[f"r{mode}_{r}_max"]

    # concat once
    new_df = pd.concat([df, pd.DataFrame(all_new, index=df.index)], axis=1)
    return new_df

# -------------------------
# Utility: estimate per-sequence timestamps (seconds)
# -------------------------
def get_sequence_times(group_df, time_col='timestamp'):
    """
    Returns times array in seconds if available, else None.
    If timestamps appear to be epoch-ms, convert to seconds.
    """
    if time_col in group_df.columns:
        times = group_df[time_col].astype(float).values
        if np.nanmean(times) > 1e6:
            times = times / 1000.0
        return times
    return None

# -------------------------
# Main pipeline
# -------------------------
def make_universe_csv(raw_csv_path,
                      out_csv_path="universe.csv",
                      tof_mode=16,
                      default_fs=200.0,
                      time_col='timestamp',
                      tof_raw_keep=True):
    """
    raw_csv_path: path to raw csv (one row per timestamp). Must include sequence_id and sensor columns.
    out_csv_path: final CSV file for CMIFeDataset
    tof_mode: same behavior as CMIFeDataset (0, >1, or -1)
    default_fs: fallback sampling rate for angular velocity if no timestamps present
    time_col: name of timestamp column if present (optional)
    tof_raw_keep: if True keep raw tof_*_v* columns (recommended)
    """
    print("Loading raw CSV:", raw_csv_path)
    df_raw = pd.read_csv(raw_csv_path)
    print("Raw shape:", df_raw.shape)

    # Prepare base: ensure sequence_id present
    if 'sequence_id' not in df_raw.columns:
        raise ValueError("Raw CSV must contain 'sequence_id' column.")

    df = df_raw.copy()

    # ensure base columns exist to avoid KeyError downstream
    for col in ['acc_x','acc_y','acc_z','rot_x','rot_y','rot_z','rot_w']:
        if col not in df.columns:
            df[col] = np.nan
    # ensure thermal columns exist: thm_1..thm_5
    for i in range(1,6):
        c = f"thm_{i}"
        if c not in df.columns:
            df[c] = np.nan

    # ----- IMU: acc_mag, rot_angle (vectorized) -----
    with np.errstate(invalid='ignore'):
        df['acc_mag'] = np.sqrt(df['acc_x'].astype(float)**2 + df['acc_y'].astype(float)**2 + df['acc_z'].astype(float)**2)
        df['rot_angle'] = 2 * np.arccos(df['rot_w'].astype(float).clip(-1,1))

    # compute acc_mag_jerk and rot_angle_vel by group (vectorized groupby diff)
    df['acc_mag_jerk'] = df.groupby('sequence_id')['acc_mag'].diff().fillna(0)
    df['rot_angle_vel'] = df.groupby('sequence_id')['rot_angle'].diff().fillna(0)

    # ----- Linear accel (remove gravity) per sequence -----
    N = len(df)
    lin_x = np.full(N, np.nan, dtype=float)
    lin_y = np.full(N, np.nan, dtype=float)
    lin_z = np.full(N, np.nan, dtype=float)

    if all(c in df.columns for c in ['acc_x','acc_y','acc_z','rot_x','rot_y','rot_z','rot_w']):
        print("Computing gravity-removed linear acceleration per sequence...")
        for seq_id, group in tqdm(df.groupby('sequence_id'), desc="linear_acc"):
            idx = group.index
            acc_sub = group[['acc_x','acc_y','acc_z']].astype(float)
            rot_sub = group[['rot_x','rot_y','rot_z','rot_w']].astype(float)
            la = remove_gravity_from_acc(acc_sub, rot_sub)
            lin_x[idx] = la[:,0]
            lin_y[idx] = la[:,1]
            lin_z[idx] = la[:,2]
    else:
        warnings.warn("Rotation or accelerometer columns missing; using raw acc with gravity approx.")
        if 'acc_x' in df.columns:
            lin_x[:] = df['acc_x'].astype(float).fillna(0.0)
        if 'acc_y' in df.columns:
            lin_y[:] = df['acc_y'].astype(float).fillna(0.0)
        if 'acc_z' in df.columns:
            lin_z[:] = df['acc_z'].astype(float).fillna(0.0) - 9.81

    df['linear_acc_x'] = lin_x
    df['linear_acc_y'] = lin_y
    df['linear_acc_z'] = lin_z
    df['linear_acc_mag'] = np.sqrt(np.nan_to_num(df['linear_acc_x'])**2 + np.nan_to_num(df['linear_acc_y'])**2 + np.nan_to_num(df['linear_acc_z'])**2)
    df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)

    # ----- Angular velocity & angular distance by sequence using timestamps if available -----
    N = len(df)
    av_x = np.zeros(N, dtype=float)
    av_y = np.zeros(N, dtype=float)
    av_z = np.zeros(N, dtype=float)
    ang_dist = np.zeros(N, dtype=float)

    if all(c in df.columns for c in ['rot_x','rot_y','rot_z','rot_w']):
        print("Computing angular velocity & angular distance per sequence...")
        for seq_id, group in tqdm(df.groupby('sequence_id'), desc="ang_vel"):
            idx = group.index
            quat_arr = group[['rot_x','rot_y','rot_z','rot_w']].astype(float).values
            times = get_sequence_times(group, time_col=time_col)  # None if missing
            ang_vel = angular_velocity_from_quat_with_dt(quat_arr, times=times, default_fs=default_fs)
            ad = calculate_angular_distance_with_times(quat_arr, times=times)
            av_x[idx] = ang_vel[:,0]
            av_y[idx] = ang_vel[:,1]
            av_z[idx] = ang_vel[:,2]
            ang_dist[idx] = ad
    else:
        warnings.warn("Rotation quaternion columns missing -> angular velocity/distance set to 0.")

    df['angular_vel_x'] = av_x
    df['angular_vel_y'] = av_y
    df['angular_vel_z'] = av_z
    df['angular_distance'] = ang_dist

    # Fill any remaining NaNs in engineered IMU with zeros (ensures columns exist)
    imu_engineered = ['acc_mag','rot_angle','acc_mag_jerk','rot_angle_vel',
                      'linear_acc_x','linear_acc_y','linear_acc_z',
                      'linear_acc_mag','linear_acc_mag_jerk',
                      'angular_vel_x','angular_vel_y','angular_vel_z',
                      'angular_distance']
    for c in imu_engineered:
        if c not in df.columns:
            df[c] = 0.0
        else:
            df[c] = df[c].fillna(0.0)

    # ----- Thermal basic stats (optional) -----
    thm_cols = [f"thm_{i}" for i in range(1,6)]
    present_thm = [c for c in thm_cols if c in df.columns]
    if present_thm:
        df['thm_mean'] = df[present_thm].astype(float).mean(axis=1)
        df['thm_std']  = df[present_thm].astype(float).std(axis=1).fillna(0.0)
    else:
        warnings.warn("Thermal columns thm_1..thm_5 not found.")

    # ----- ToF features (vectorized, concat once) -----
    print("Computing ToF features (this may take a moment for large files)...")
    df = add_all_tof_features_vectorized(df, tof_mode)

    # Optionally drop raw tof columns (not recommended for baseline)
    if not tof_raw_keep:
        for tof_id in range(1,6):
            for p in range(64):
                c = f"tof_{tof_id}_v{p}"
                if c in df.columns:
                    df.drop(columns=[c], inplace=True)

    # Final save
    print("Final dataframe shape:", df.shape)
    df.to_csv(out_csv_path, index=False)
    print("Saved universe CSV to:", out_csv_path)
    return df


In [9]:
if __name__ == "__main__": 
    RAW_CSV = data_path      
    OUT_CSV = "/kaggle/working/train_universe.csv"     
    TOF_MODE = 16                  # set to same value your dataset config uses (e.g. 4,8,16) or -1 for [2,4,8,16,32]
    DEFAULT_FS = 200.0             

    # run
    universe_df = make_universe_csv(RAW_CSV, OUT_CSV, tof_mode=TOF_MODE, default_fs=DEFAULT_FS, time_col='timestamp', tof_raw_keep=True)


Loading raw CSV: /kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv
Raw shape: (574945, 341)
Computing gravity-removed linear acceleration per sequence...


linear_acc:   0%|          | 0/8151 [00:00<?, ?it/s]

Computing angular velocity & angular distance per sequence...


ang_vel:   0%|          | 0/8151 [00:00<?, ?it/s]

Computing ToF features (this may take a moment for large files)...


  new_cols["mean"] = np.nanmean(mat, axis=1)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  new_cols["min"]  = np.nanmin(mat, axis=1)
  new_cols["max"]  = np.nanmax(mat, axis=1)
  mean_v = np.nanmean(region_vals, axis=1)
  min_v  = np.nanmin(region_vals, axis=1)
  max_v  = np.nanmax(region_vals, axis=1)
  new_cols["mean"] = np.nanmean(mat, axis=1)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  new_cols["min"]  = np.nanmin(mat, axis=1)
  new_cols["max"]  = np.nanmax(mat, axis=1)
  mean_v = np.nanmean(region_vals, axis=1)
  min_v  = np.nanmin(region_vals, axis=1)
  max_v  = np.nanmax(region_vals, axis=1)
  new_cols["mean"] = np.nanmean(mat, axis=1)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  new_cols["min"]  = np.nanmin(mat, axis=1)
  new_cols["max"]  = np.nanmax(mat, axis=1)
  mean_v = np.nanmean(region_vals, axis=1)
  min_v  = np.nanmin(region_vals, axis=1)
  max_v  = np.nanmax(region_vals, axis=1)
  new_cols["mean"] = np.nanmean(ma

Final dataframe shape: (574945, 696)
Saved universe CSV to: /kaggle/working/train_universe.csv


In [10]:
universe_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,row_id,sequence_type,sequence_id,sequence_counter,subject,orientation,behavior,phase,gesture,acc_x,...,tof16_5_region_13_min,tof16_5_region_13_max,tof16_5_region_14_mean,tof16_5_region_14_std,tof16_5_region_14_min,tof16_5_region_14_max,tof16_5_region_15_mean,tof16_5_region_15_std,tof16_5_region_15_min,tof16_5_region_15_max
0,SEQ_000007_000000,Target,SEQ_000007,0,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.683594,...,,,,,,,,,,
1,SEQ_000007_000001,Target,SEQ_000007,1,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.949219,...,,,,,,,,,,
2,SEQ_000007_000002,Target,SEQ_000007,2,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.722656,...,,,115.5,3.5,112.0,119.0,,,,
3,SEQ_000007_000003,Target,SEQ_000007,3,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.601562,...,,,106.0,5.0,101.0,111.0,,,,
4,SEQ_000007_000004,Target,SEQ_000007,4,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.566406,...,,,111.666667,9.977753,101.0,125.0,,,,


In [24]:
# CMIFEDataset Class inheriting base Dataset Class
class CMIFeDataset(Dataset):
    def __init__(self, data_path, config):
        self.config = config
        self.init_feature_names(data_path)
        df = self.generate_features(pd.read_csv(data_path, usecols=set(self.base_cols+self.feature_cols)))
        self.generate_dataset(df)

    def init_feature_names(self, data_path):
        self.imu_engineered_features = [
            'acc_mag', 'rot_angle',
            'acc_mag_jerk', 'rot_angle_vel',
            'linear_acc_mag', 'linear_acc_mag_jerk',
            'angular_vel_x', 'angular_vel_y', 'angular_vel_z',
            'angular_distance'
        ]

        self.tof_mode = self.config.get("tof_mode", "stats")
        self.tof_region_stats = ['mean', 'std', 'min', 'max']
        self.tof_cols = self.generate_tof_feature_names()

        columns = pd.read_csv(data_path, nrows=0).columns.tolist()
        imu_cols_base = ['linear_acc_x', 'linear_acc_y', 'linear_acc_z']
        imu_cols_base.extend([c for c in columns if c.startswith('rot_') and c not in ['rot_angle', 'rot_angle_vel']])
        self.imu_cols = list(dict.fromkeys(imu_cols_base + self.imu_engineered_features))
        self.thm_cols = [c for c in columns if c.startswith('thm_')]
        self.feature_cols = self.imu_cols + self.thm_cols + self.tof_cols
        self.imu_dim = len(self.imu_cols)
        self.thm_dim = len(self.thm_cols)
        self.tof_dim = len(self.tof_cols)
        self.base_cols = ['acc_x', 'acc_y', 'acc_z',
                          'rot_x', 'rot_y', 'rot_z', 'rot_w',
                          'sequence_id', 'subject', 
                          'sequence_type', 'gesture', 'orientation'] + [c for c in columns if c.startswith('thm_')] + [f"tof_{i}_v{p}" for i in range(1, 6) for p in range(64)]
        self.fold_cols = ['subject', 'sequence_type', 'gesture', 'orientation']

    def generate_tof_feature_names(self):
        features = []
        if self.config.get("tof_raw", False):
            for i in range(1, 6):
                features.extend([f"tof_{i}_v{p}" for p in range(64)])
        for i in range(1, 6):
            if self.tof_mode != 0:
                for stat in self.tof_region_stats:
                    features.append(f'tof_{i}_{stat}')
                if self.tof_mode > 1:
                    for r in range(self.tof_mode):
                        for stat in self.tof_region_stats:
                            features.append(f'tof{self.tof_mode}_{i}_region_{r}_{stat}')
                if self.tof_mode == -1:
                    for mode in [2, 4, 8, 16, 32]:
                        for r in range(mode):
                            for stat in self.tof_region_stats:
                                features.append(f'tof{mode}_{i}_region_{r}_{stat}')
        return features

    def compute_features(self, df):
        df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)
        df['rot_angle'] = 2 * np.arccos(df['rot_w'].clip(-1, 1))
        df['acc_mag_jerk'] = df.groupby('sequence_id')['acc_mag'].diff().fillna(0)
        df['rot_angle_vel'] = df.groupby('sequence_id')['rot_angle'].diff().fillna(0)
            
        linear_accel_list = []
        for _, group in df.groupby('sequence_id'):
            acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]
            rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
            linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)
            linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))
        df_linear_accel = pd.concat(linear_accel_list)
        df = pd.concat([df, df_linear_accel], axis=1)
        df['linear_acc_mag'] = np.sqrt(df['linear_acc_x']**2 + df['linear_acc_y']**2 + df['linear_acc_z']**2)
        df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)
    
        angular_vel_list = []
        for _, group in df.groupby('sequence_id'):
            rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
            angular_vel_group = calculate_angular_velocity_from_quat(rot_data_group)
            angular_vel_list.append(pd.DataFrame(angular_vel_group, columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index))
        df_angular_vel = pd.concat(angular_vel_list)
        df = pd.concat([df, df_angular_vel], axis=1)
    
        angular_distance_list = []
        for _, group in df.groupby('sequence_id'):
            rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
            angular_dist_group = calculate_angular_distance(rot_data_group)
            angular_distance_list.append(pd.DataFrame(angular_dist_group, columns=['angular_distance'], index=group.index))
        df_angular_distance = pd.concat(angular_distance_list)
        df = pd.concat([df, df_angular_distance], axis=1)

        if self.tof_mode != 0:
            new_columns = {}
            for i in range(1, 6):
                pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
                tof_data = df[pixel_cols].replace(-1, np.nan)
                new_columns.update({
                    f'tof_{i}_mean': tof_data.mean(axis=1),
                    f'tof_{i}_std': tof_data.std(axis=1),
                    f'tof_{i}_min': tof_data.min(axis=1),
                    f'tof_{i}_max': tof_data.max(axis=1)
                })
                if self.tof_mode > 1:
                    region_size = 64 // self.tof_mode
                    for r in range(self.tof_mode):
                        region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]
                        new_columns.update({
                            f'tof{self.tof_mode}_{i}_region_{r}_mean': region_data.mean(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_std': region_data.std(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_min': region_data.min(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_max': region_data.max(axis=1)
                        })
                if self.tof_mode == -1:
                    for mode in [2, 4, 8, 16, 32]:
                        region_size = 64 // mode
                        for r in range(mode):
                            region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]
                            new_columns.update({
                                f'tof{mode}_{i}_region_{r}_mean': region_data.mean(axis=1),
                                f'tof{mode}_{i}_region_{r}_std': region_data.std(axis=1),
                                f'tof{mode}_{i}_region_{r}_min': region_data.min(axis=1),
                                f'tof{mode}_{i}_region_{r}_max': region_data.max(axis=1)
                            })
            df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)
        return df
        
    def generate_features(self, df):
        self.le = LabelEncoder()
        df['gesture_int'] = self.le.fit_transform(df['gesture'])
        self.class_num = len(self.le.classes_)

        joblib.dump(self.le, "label_encoder.pkl")
        
        if all(c in df.columns for c in self.imu_engineered_features) and all(c in df.columns for c in self.tof_cols):
            print("Have precomputed, skip compute.")
        else:
            print("Not precomputed, do compute.")
            df = self.compute_features(df)

        if self.config.get("save_precompute", False):
            df.to_csv(self.config.get("save_filename", "train.csv"))
        return df

    def scale(self, data_unscaled):
        scaler_function = self.config.get("scaler_function", StandardScaler())
        scaler = scaler_function.fit(np.concatenate(data_unscaled, axis=0))
        return [scaler.transform(x) for x in data_unscaled], scaler

    def pad(self, data_scaled, cols):
        pad_data = np.zeros((len(data_scaled), self.pad_len, len(cols)), dtype='float32')
        for i, seq in enumerate(data_scaled):
            seq_len = min(len(seq), self.pad_len)
            pad_data[i, :seq_len] = seq[:seq_len]
        return pad_data

    def get_nan_value(self, data, ratio):
        max_value = data.max().max()
        nan_value = -max_value * ratio
        return nan_value

    def generate_dataset(self, df):
        seq_gp = df.groupby('sequence_id') 
        imu_unscaled, thm_unscaled, tof_unscaled = [], [], []
        classes, lens = [], []
        self.imu_nan_value = self.get_nan_value(df[self.imu_cols], self.config["nan_ratio"]["imu"])
        self.thm_nan_value = self.get_nan_value(df[self.thm_cols], self.config["nan_ratio"]["thm"])
        self.tof_nan_value = self.get_nan_value(df[self.tof_cols], self.config["nan_ratio"]["tof"])

        self.fold_feats = defaultdict(list)
        for seq_id, seq_df in seq_gp:
            imu_data = seq_df[self.imu_cols]
            if self.config["fbfill"]["imu"]:
                imu_data = imu_data.ffill().bfill()
            imu_unscaled.append(imu_data.fillna(self.imu_nan_value).values.astype('float32'))

            thm_data = seq_df[self.thm_cols]
            if self.config["fbfill"]["thm"]:
                thm_data = thm_data.ffill().bfill()
            thm_unscaled.append(thm_data.fillna(self.thm_nan_value).values.astype('float32'))

            tof_data = seq_df[self.tof_cols]
            if self.config["fbfill"]["tof"]:
                tof_data = tof_data.ffill().bfill()
            tof_unscaled.append(tof_data.fillna(self.tof_nan_value).values.astype('float32'))
            
            classes.append(seq_df['gesture_int'].iloc[0])
            lens.append(len(imu_data))

            for col in self.fold_cols:
                self.fold_feats[col].append(seq_df[col].iloc[0])
            
        self.dataset_indices = classes
        self.pad_len = int(np.percentile(lens, self.config.get("percent", 95)))
        if self.config.get("one_scale", True):
            x_unscaled = [np.concatenate([imu, thm, tof], axis=1) for imu, thm, tof in zip(imu_unscaled, thm_unscaled, tof_unscaled)]
            x_scaled, self.x_scaler = self.scale(x_unscaled)
            x = self.pad(x_scaled, self.imu_cols+self.thm_cols+self.tof_cols)
            self.imu = x[..., :self.imu_dim]
            self.thm = x[..., self.imu_dim:self.imu_dim+self.thm_dim]
            self.tof = x[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]
        else:
            imu_scaled, self.imu_scaler = self.scale(imu_unscaled)
            thm_scaled, self.thm_scaler = self.scale(thm_unscaled)
            tof_scaled, self.tof_scaler = self.scale(tof_unscaled)
            self.imu = self.pad(imu_scaled, self.imu_cols)
            self.thm = self.pad(thm_scaled, self.thm_cols)
            self.tof = self.pad(tof_scaled, self.tof_cols)
        self.precompute_scaled_nan_values()
        self.class_ = F.one_hot(torch.from_numpy(np.array(classes)).long(), num_classes=len(self.le.classes_)).float().numpy()
        self.class_weight = torch.FloatTensor(compute_class_weight('balanced', classes=np.arange(len(self.le.classes_)), y=classes))

    def precompute_scaled_nan_values(self):
        dummy_df = pd.DataFrame(
            np.array([[self.imu_nan_value]*len(self.imu_cols) + 
                     [self.thm_nan_value]*len(self.thm_cols) +
                     [self.tof_nan_value]*len(self.tof_cols)]),
            columns=self.imu_cols + self.thm_cols + self.tof_cols
        )
        
        if self.config.get("one_scale", True):
            scaled = self.x_scaler.transform(dummy_df)
            self.imu_scaled_nan = scaled[0, :self.imu_dim].mean()
            self.thm_scaled_nan = scaled[0, self.imu_dim:self.imu_dim+self.thm_dim].mean()
            self.tof_scaled_nan = scaled[0, self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim].mean()
        else:
            self.imu_scaled_nan = self.imu_scaler.transform(dummy_df[self.imu_cols])[0].mean()
            self.thm_scaled_nan = self.thm_scaler.transform(dummy_df[self.thm_cols])[0].mean()
            self.tof_scaled_nan = self.tof_scaler.transform(dummy_df[self.tof_cols])[0].mean()

    def get_scaled_nan_tensors(self, imu, thm, tof):
        return torch.full(imu.shape, self.imu_scaled_nan, device=imu.device), \
            torch.full(thm.shape, self.thm_scaled_nan, device=thm.device), \
            torch.full(tof.shape, self.tof_scaled_nan, device=tof.device)

    def inference_process(self, sequence):
        df_seq = sequence#.to_pandas().copy()
        if not all(c in df_seq.columns for c in self.imu_engineered_features):
            df_seq['acc_mag'] = np.sqrt(df_seq['acc_x']**2 + df_seq['acc_y']**2 + df_seq['acc_z']**2)
            df_seq['rot_angle'] = 2 * np.arccos(df_seq['rot_w'].clip(-1, 1))
            df_seq['acc_mag_jerk'] = df_seq['acc_mag'].diff().fillna(0)
            df_seq['rot_angle_vel'] = df_seq['rot_angle'].diff().fillna(0)
            if all(col in df_seq.columns for col in ['acc_x', 'acc_y', 'acc_z', 'rot_x', 'rot_y', 'rot_z', 'rot_w']):
                linear_accel = remove_gravity_from_acc(
                    df_seq[['acc_x', 'acc_y', 'acc_z']], 
                    df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
                )
                df_seq[['linear_acc_x', 'linear_acc_y', 'linear_acc_z']] = linear_accel
            else:
                df_seq['linear_acc_x'] = df_seq.get('acc_x', 0)
                df_seq['linear_acc_y'] = df_seq.get('acc_y', 0)
                df_seq['linear_acc_z'] = df_seq.get('acc_z', 0)
            df_seq['linear_acc_mag'] = np.sqrt(df_seq['linear_acc_x']**2 + df_seq['linear_acc_y']**2 + df_seq['linear_acc_z']**2)
            df_seq['linear_acc_mag_jerk'] = df_seq['linear_acc_mag'].diff().fillna(0)
            if all(col in df_seq.columns for col in ['rot_x', 'rot_y', 'rot_z', 'rot_w']):
                angular_vel = calculate_angular_velocity_from_quat(df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']])
                df_seq[['angular_vel_x', 'angular_vel_y', 'angular_vel_z']] = angular_vel
            else:
                df_seq[['angular_vel_x', 'angular_vel_y', 'angular_vel_z']] = 0
            if all(col in df_seq.columns for col in ['rot_x', 'rot_y', 'rot_z', 'rot_w']):
                df_seq['angular_distance'] = calculate_angular_distance(df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']])
            else:
                df_seq['angular_distance'] = 0

        if self.tof_mode != 0:
            new_columns = {} 
            for i in range(1, 6):
                pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
                tof_data = df_seq[pixel_cols].replace(-1, np.nan)
                new_columns.update({
                    f'tof_{i}_mean': tof_data.mean(axis=1),
                    f'tof_{i}_std': tof_data.std(axis=1),
                    f'tof_{i}_min': tof_data.min(axis=1),
                    f'tof_{i}_max': tof_data.max(axis=1)
                })
                if self.tof_mode > 1:
                    region_size = 64 // self.tof_mode
                    for r in range(self.tof_mode):
                        region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]
                        new_columns.update({
                            f'tof{self.tof_mode}_{i}_region_{r}_mean': region_data.mean(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_std': region_data.std(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_min': region_data.min(axis=1),
                            f'tof{self.tof_mode}_{i}_region_{r}_max': region_data.max(axis=1)
                        })
                if self.tof_mode == -1:
                    for mode in [2, 4, 8, 16, 32]:
                        region_size = 64 // mode
                        for r in range(mode):
                            region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]
                            new_columns.update({
                                f'tof{mode}_{i}_region_{r}_mean': region_data.mean(axis=1),
                                f'tof{mode}_{i}_region_{r}_std': region_data.std(axis=1),
                                f'tof{mode}_{i}_region_{r}_min': region_data.min(axis=1),
                                f'tof{mode}_{i}_region_{r}_max': region_data.max(axis=1)
                            })
            df_seq = pd.concat([df_seq, pd.DataFrame(new_columns)], axis=1)
        
        imu_unscaled = df_seq[self.imu_cols]
        if self.config["fbfill"]["imu"]:
            imu_unscaled = imu_unscaled.ffill().bfill()
        imu_unscaled = imu_unscaled.fillna(self.imu_nan_value).values.astype('float32')

        thm_unscaled = df_seq[self.thm_cols]
        if self.config["fbfill"]["thm"]:
            thm_unscaled = thm_unscaled.ffill().bfill()
        thm_unscaled = thm_unscaled.fillna(self.thm_nan_value).values.astype('float32')

        tof_unscaled = df_seq[self.tof_cols]
        if self.config["fbfill"]["tof"]:
            tof_unscaled = tof_unscaled.ffill().bfill()
        tof_unscaled = tof_unscaled.fillna(self.tof_nan_value).values.astype('float32')
        
        if self.config.get("one_scale", True):
            x_unscaled = np.concatenate([imu_unscaled, thm_unscaled, tof_unscaled], axis=1)
            x_scaled = self.x_scaler.transform(x_unscaled)
            imu_scaled = x_scaled[..., :self.imu_dim]
            thm_scaled = x_scaled[..., self.imu_dim:self.imu_dim+self.thm_dim]
            tof_scaled = x_scaled[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]
        else:
            imu_scaled = self.imu_scaler.transform(imu_unscaled)
            thm_scaled = self.thm_scaler.transform(thm_unscaled)
            tof_scaled = self.tof_scaler.transform(tof_unscaled)

        combined = np.concatenate([imu_scaled, thm_scaled, tof_scaled], axis=1)
        padded = np.zeros((self.pad_len, combined.shape[1]), dtype='float32')
        seq_len = min(combined.shape[0], self.pad_len)
        padded[:seq_len] = combined[:seq_len]
        imu = padded[..., :self.imu_dim]
        thm = padded[..., self.imu_dim:self.imu_dim+self.thm_dim]
        tof = padded[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]
        
        return torch.from_numpy(imu).float().unsqueeze(0), torch.from_numpy(thm).float().unsqueeze(0), torch.from_numpy(tof).float().unsqueeze(0)

    def __getitem__(self, idx):
        return self.imu[idx], self.thm[idx], self.tof[idx], self.class_[idx]

    def __len__(self):
        return len(self.class_)

---

This class is a **cross-validation wrapper** around a full dataset (`CMIFeDataset`).

### What it does:

1. **Initializes**:

   * Builds the full dataset using `full_dataset_function` (e.g., `CMIFeDataset`).
   * Stores feature dimensions (IMU, thermal, ToF), label encoder, class names, and class weights.
   * Creates a **StratifiedKFold** object for *n*-fold cross-validation (preserving class balance in each split).
   * Generates all fold train/validation index splits in advance.

2. **Provides**:

   * `get_fold_datasets(fold_idx)` → returns PyTorch `Subset` objects for training and validation for a given fold.
   * `print_fold_stats()` → prints how many samples per class are in train/valid sets for each fold (helps verify balanced splits).

---

✅ **Purpose**: Makes it easy to run stratified *n*-fold cross-validation on a dataset while keeping all feature preprocessing identical.
✅ **Key benefit**: Automatically handles train/validation splitting with label balance and quick class distribution inspection.

---


In [25]:
# CMIFoldDataset for StratifiedKFold
class CMIFoldDataset:
    def __init__(self, data_path, config, full_dataset_function, n_folds=5, random_seed=42):
        self.full_dataset = full_dataset_function(data_path=data_path, config=config)
        self.imu_dim = self.full_dataset.imu_dim
        self.thm_dim = self.full_dataset.thm_dim
        self.tof_dim = self.full_dataset.tof_dim
        self.le = self.full_dataset.le
        self.class_names = self.full_dataset.le.classes_
        self.class_weight = self.full_dataset.class_weight
        self.n_folds = n_folds
        self.skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
        self.folds = list(self.skf.split(np.arange(len(self.full_dataset)), np.array(self.full_dataset.dataset_indices)))
    
    def get_fold_datasets(self, fold_idx):
        if self.folds is None or fold_idx >= self.n_folds:
            return None, None
        fold_train_idx, fold_valid_idx = self.folds[fold_idx]
        return Subset(self.full_dataset, fold_train_idx), Subset(self.full_dataset, fold_valid_idx)

    def print_fold_stats(self):
        def get_label_counts(subset):
            counts = {name: 0 for name in self.class_names}
            if subset is None:
                return counts
            for idx in subset.indices:
                label_idx = self.full_dataset.dataset_indices[idx]
                counts[self.class_names[label_idx]] += 1
            return counts
        
        print("\nCross-validation fold statistics:")
        for fold_idx in range(self.n_folds):
            train_fold, valid_fold = self.get_fold_datasets(fold_idx)
            train_counts = get_label_counts(train_fold)
            valid_counts = get_label_counts(valid_fold)
                
            print(f"\nFold {fold_idx + 1}:")
            print(f"{'name':<50} {'train_counts':<10} {'valid_counts':<10}")
            for name in self.class_names:
                print(f"{name:<50} {train_counts[name]:<10} {valid_counts[name]:<10}")

Here’s the side-by-side diagram showing how **`CMIFeDataset`** and **`CMIFoldDataset`** relate:

---

**Flow Diagram — Full Pipeline**

```
                ┌──────────────────────────────────┐
                │  CMIFeDataset                     │
                │----------------------------------│
                │  1. Load CSV (raw sensor data)    │
                │  2. Init feature names            │
                │  3. Feature engineering           │
                │     - IMU engineered features     │
                │     - ToF aggregated stats        │
                │  4. Handle missing data           │
                │  5. Scale & pad sequences         │
                │  6. Encode labels + weights       │
                │  7. Store tensors (imu/thm/tof)   │
                └──────────────────────────────────┘
                              │
                              ▼
        (Full processed dataset: ready for model training)
                              │
                              ▼
                ┌──────────────────────────────────┐
                │  CMIFoldDataset                   │
                │----------------------------------│
                │  1. Takes CMIFeDataset as input   │
                │  2. Uses StratifiedKFold to split │
                │     into n folds (balanced)       │
                │  3. Stores train/valid indices    │
                │  4. get_fold_datasets(fold_idx)   │
                │     → Returns PyTorch Subsets     │
                │  5. print_fold_stats() shows      │
                │     per-class sample counts       │
                └──────────────────────────────────┘
```

**Relationship**:

* **`CMIFeDataset`** = **full data preparation engine** (raw CSV → clean, scaled, padded tensors).
* **`CMIFoldDataset`** = **cross-validation manager** (splits the already-prepared dataset into balanced folds).

---


# **Model Construction**

**Classes**

This `SEBlock` is a **Squeeze-and-Excitation** module for 1D features.

**Short explanation:**
It learns **channel-wise attention weights** to emphasize important feature channels and suppress less useful ones.

**Steps:**

1. **Squeeze:** `adaptive_avg_pool1d` reduces each channel to a single value → global average per channel.
2. **Excitation:** Two small fully connected layers (with reduction ratio) learn how important each channel is.
3. **Scale:** A `sigmoid` gives weights in `[0, 1]` for each channel, which are multiplied back with the original `x`.

Effect: Channels that help the task get boosted, and noisy channels get weakened.


In [26]:
class SEBlock(nn.Module):
    def __init__(self, channels, reduction = 8):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction, bias=True)
        self.fc2 = nn.Linear(channels // reduction, channels, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: (B, C, L)
        se = F.adaptive_avg_pool1d(x, 1).squeeze(-1)      # -> (B, C)
        se = F.relu(self.fc1(se), inplace=True)          # -> (B, C//r)
        se = self.sigmoid(self.fc2(se)).unsqueeze(-1)    # -> (B, C, 1)
        return x * se                

This `ResNetSEBlock` is basically a **1D ResNet block with a Squeeze-and-Excitation (SE) module**.

**Short explanation:**
It applies two convolution layers with batch normalization and ReLU, recalibrates channel importance using **SEBlock**, adds a residual (shortcut) connection, and applies ReLU again.

**Steps:**

1. **Main path:**

   * Conv1D → BN → ReLU
   * Conv1D → BN
   * SEBlock to apply channel attention.
2. **Shortcut path:**

   * If input/output channels differ → 1×1 Conv + BN to match dimensions.
   * Else, pass identity directly.
3. **Add & Activate:**

   * Sum main path output and shortcut.
   * Apply final ReLU.

**Effect:**
Keeps the benefits of residual learning (easy gradient flow, deeper networks) while also letting the network focus on **important channels** via the SE module.


In [27]:
class ResNetSEBlock(nn.Module):
    def __init__(self, in_channels, out_channels, wd = 1e-4):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels,
                               kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels,
                               kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        # SE
        self.se = SEBlock(out_channels)
        
        if in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1,
                          padding=0, bias=False),
                nn.BatchNorm1d(out_channels)
            )
        else:
            self.shortcut = nn.Identity()

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x) :
        identity = self.shortcut(x)              # (B, out, L)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = self.se(out)                       # (B, out, L)
        out = out + identity
        return self.relu(out)

This `CMIModel` is a **multi-branch deep learning model** that processes **three different sensor inputs** (IMU, thermal (thm), and time-of-flight (tof)) separately, fuses them, and uses a **BERT-based transformer** for sequence modeling before classification.

---

### **Short explanation**

1. **Three feature extraction branches:**

   * **IMU branch** → uses multiple `ResNetSEBlock`s for deep feature extraction with channel attention (SE).
   * **THM branch** → two Conv1D + BN + ReLU + MaxPool layers.
   * **TOF branch** → similar to THM branch.

2. **Fusion:**

   * After extracting features, concatenates them along the **feature dimension**.

3. **BERT for sequence modeling:**

   * Adds a trainable `cls_token` (like in Vision Transformers).
   * Passes the sequence through a BERT encoder (custom config).
   * Uses the output of the `cls_token` position as the **global representation**.

4. **Classification:**

   * A multi-layer fully connected classifier converts the BERT output into final class scores.

---

### **Data flow (forward pass)**

1. **IMU input** `(B, L, imu_dim)` → `permute` to `(B, imu_dim, L)` → goes through **IMU branch** (ResNetSE + pooling + dropout) → `(B, feat_dim, reduced_L)`.
2. **THM input** `(B, L, thm_dim)` → similar but with Conv1D layers → `(B, feat_dim, reduced_L)`.
3. **TOF input** `(B, L, tof_dim)` → similar Conv1D layers → `(B, feat_dim, reduced_L)`.
4. **Concat features** → `(B, reduced_L, feat_dim_total)` → transformer input format.
5. **Add `cls_token`** and feed into **BERT** → `(B, reduced_L+1, feat_dim)`.
6. **Take `cls_token` output** → classifier → final `(B, n_classes)` logits.

---

**Effect:**
This model combines **CNNs for local feature extraction**, **SE blocks for channel attention**, and **BERT for long-range temporal dependencies**, making it suitable for **multi-sensor sequence classification**.

---

In [28]:
# CMIModel Class
class CMIModel(nn.Module):
    def __init__(self, imu_dim, thm_dim, tof_dim, n_classes, **kwargs):
        super().__init__()
        self.imu_branch = nn.Sequential(
            self.residual_se_cnn_block(imu_dim, kwargs["imu1_channels"], kwargs["imu1_layers"],
                                       drop=kwargs["imu1_dropout"]),
            self.residual_se_cnn_block(kwargs["imu1_channels"], kwargs["feat_dim"], kwargs["imu2_layers"],
                                       drop=kwargs["imu2_dropout"])
        )

        self.thm_branch = nn.Sequential(
            nn.Conv1d(thm_dim, kwargs["thm1_channels"], kernel_size=3, padding=1, bias=False),
            nn.BatchNorm1d(kwargs["thm1_channels"]),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2, ceil_mode=True),
            nn.Dropout(kwargs["thm1_dropout"]),
            
            nn.Conv1d(kwargs["thm1_channels"], kwargs["feat_dim"], kernel_size=3, padding=1, bias=False),
            nn.BatchNorm1d(kwargs["feat_dim"]),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2, ceil_mode=True),
            nn.Dropout(kwargs["thm2_dropout"])
        )
        
        self.tof_branch = nn.Sequential(
            nn.Conv1d(tof_dim, kwargs["tof1_channels"], kernel_size=3, padding=1, bias=False),
            nn.BatchNorm1d(kwargs["tof1_channels"]),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2, ceil_mode=True),
            nn.Dropout(kwargs["tof1_dropout"]),
            
            nn.Conv1d(kwargs["tof1_channels"], kwargs["feat_dim"], kernel_size=3, padding=1, bias=False),
            nn.BatchNorm1d(kwargs["feat_dim"]),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2, ceil_mode=True),
            nn.Dropout(kwargs["tof2_dropout"])
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, kwargs["feat_dim"]))
        self.bert = BertModel(BertConfig(
            hidden_size=kwargs["feat_dim"],
            num_hidden_layers=kwargs["bert_layers"],
            num_attention_heads=kwargs["bert_heads"],
            intermediate_size=kwargs["feat_dim"]*4
        ))
        
        self.classifier = nn.Sequential(
            nn.Linear(kwargs["feat_dim"], kwargs["cls1_channels"], bias=False),
            nn.BatchNorm1d(kwargs["cls1_channels"]),
            nn.ReLU(inplace=True),
            nn.Dropout(kwargs["cls1_dropout"]),
            nn.Linear(kwargs["cls1_channels"], kwargs["cls2_channels"], bias=False),
            nn.BatchNorm1d(kwargs["cls2_channels"]),
            nn.ReLU(inplace=True),
            nn.Dropout(kwargs["cls2_dropout"]),
            nn.Linear(kwargs["cls2_channels"], n_classes)
        )
    
    def residual_se_cnn_block(self, in_channels, out_channels, num_layers, pool_size=2, drop=0.3, wd=1e-4):
        return nn.Sequential(
            *[ResNetSEBlock(in_channels=in_channels, out_channels=in_channels) for i in range(num_layers)],
            ResNetSEBlock(in_channels, out_channels, wd=wd),
            nn.MaxPool1d(pool_size),
            nn.Dropout(drop)
        )
    
    def forward(self, imu, thm, tof):
        imu_feat = self.imu_branch(imu.permute(0, 2, 1))
        thm_feat = self.thm_branch(thm.permute(0, 2, 1))
        tof_feat = self.tof_branch(tof.permute(0, 2, 1))
        
        bert_input = torch.cat([imu_feat, thm_feat, tof_feat], dim=-1).permute(0, 2, 1)
        cls_token = self.cls_token.expand(bert_input.size(0), -1, -1)  # (B,1,H)
        bert_input = torch.cat([cls_token, bert_input], dim=1)  # (B,T+1,H)
        outputs = self.bert(inputs_embeds=bert_input)
        pred_cls = outputs.last_hidden_state[:, 0, :]

        return self.classifier(pred_cls)


# **Calling Dataset Creation**

In [29]:
CUDA0 = "cuda:0"
seed = 42
batch_size = 64
num_workers = 4
n_folds = 5

# universe_csv_path = Path("/kaggle/input/cmi-precompute/pytorch/all/1/tof-1_raw.csv")
# universe_csv_path = Path("/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv")
universe_csv_path = Path("/kaggle/working/train_universe.csv")

deterministic = kagglehub.package_import('wasupandceacar/deterministic').deterministic
deterministic.init_all(seed)

**init_dataset() ties them together**

When you run:
```
    dataset = CMIFoldDataset(..., full_dataset_function=CMIFeDataset)
```

the sequence is:


1. CMIFoldDataset calls CMIFeDataset(data_path, config)→ Loads and preprocesses the entire dataset once.
2. Splits that processed dataset into n_folds (cross-validation).
3. Prints fold stats.
4. Returns the fold-managed dataset object.


```

 CSV file (raw)
   │
   ▼
 CMIFeDataset (load + clean + scale + process)
   │ processed full dataset
   ▼
 CMIFoldDataset (split into n_folds for CV)
   │
   ├── fold 0: train / val
   ├── fold 1: train / val
   ├── ...
   └── fold n: train / val

```


In [30]:
def init_dataset():
    dataset_config = {
        "percent": 95,
        "scaler_function": StandardScaler(),
        "nan_ratio": {
            "imu": 0,
            "thm": 0,
            "tof": 0,
        },
        "fbfill": {
            "imu": True,
            "thm": True,
            "tof": True,
        },
        "one_scale": True,
        "tof_raw": True,
        "tof_mode": 16,
        "save_precompute": False,
    }
    dataset = CMIFoldDataset(universe_csv_path, dataset_config,
                             n_folds=n_folds, random_seed=seed, full_dataset_function=CMIFeDataset)
    dataset.print_fold_stats()
    return dataset

**get_fold_dataset :**

* Calls dataset.get_fold_datasets(fold) → returns (train_dataset, valid_dataset) for that fold.
* Ignores training dataset (_) and keeps only the validation dataset.
* Wraps it in a PyTorch DataLoader for batching and parallel loading:

**batch_size = how many samples per batch.
num_workers = parallel threads for loading data.
shuffle=False → validation order is fixed.**


* Returns this valid_loader.



In [31]:
def get_fold_dataset(dataset, fold):
    _, valid_dataset = dataset.get_fold_datasets(fold)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    return valid_loader

In [32]:
dataset = init_dataset()

model_function = CMIModel
model_args = {"feat_dim": 500,
              "imu1_channels": 219, "imu1_dropout": 0.2946731587132302, "imu2_dropout": 0.2697745571929592,
              "imu1_weight_decay": 0.0014824054650601245, "imu2_weight_decay": 0.002742543773142381,
              "imu1_layers": 0, "imu2_layers": 0,
              "thm1_channels": 82, "thm1_dropout": 0.2641274454844702, "thm2_dropout": 0.302896343020985, 
              "tof1_channels": 82, "tof1_dropout": 0.2641274454844702, "tof2_dropout": 0.3028963430209852, 
              "bert_layers": 8, "bert_heads": 10,
              "cls1_channels": 937, "cls2_channels": 303, "cls1_dropout": 0.2281834512400508, "cls2_dropout": 0.22502521934558461}
model_args.update({
    "imu_dim": dataset.full_dataset.imu_dim, 
    "thm_dim": dataset.full_dataset.thm_dim,
    "tof_dim": dataset.full_dataset.tof_dim,
    "n_classes": dataset.full_dataset.class_num})
# model_dir = Path("/kaggle/input/cmi-models-public/pytorch/train_fold_model05_tof16_raw/1")
model_dir = Path("/kaggle/input/cmi-model-3-utilities")

model_dicts = [
    {
        "model_function": model_function,
        "model_args": model_args,
        # "model_path": model_dir / f"fold{fold}/best_ema.pt",
        # "model_path": f"/kaggle/working/models/Bert_Model_CMI_{fold}.pkl",
        "model_path": model_dir / f"best_fold{fold}.pt",
    } for fold in range(n_folds)
]

Have precomputed, skip compute.

Cross-validation fold statistics:

Fold 1:
name                                               train_counts valid_counts
Above ear - pull hair                              511        127       
Cheek - pinch skin                                 509        128       
Drink from bottle/cup                              129        32        
Eyebrow - pull hair                                510        128       
Eyelash - pull hair                                512        128       
Feel around in tray and pull out an object         129        32        
Forehead - pull hairline                           512        128       
Forehead - scratch                                 512        128       
Glasses on/off                                     128        33        
Neck - pinch skin                                  512        128       
Neck - scratch                                     512        128       
Pinch knee/leg skin                         



# **Model Training**

In [20]:
# dataset.get_fold_datasets(0)

In [33]:
TARGET_GESTURE_NAMES = list(universe_df['gesture'].astype(str))

In [34]:
def set_seed(seed: int):
    """Make experiments reproducible (best-effort)."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Torch deterministic settings (may slow training)
    try:
        # Newer PyTorch: prefer strict deterministic algorithms
        torch.use_deterministic_algorithms(True)
    except Exception:
        try:
            torch.set_deterministic(True)
        except Exception:
            pass
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [35]:
import random 

dataset_obj = dataset
model_fn = model_function
model_args = model_args

# Provide target_gesture_names if not auto-detectable from dataset.
# Replace with actual target gesture names from contest if needed (strings) OR integer ids.
TARGET_GESTURE_NAMES = globals().get("TARGET_GESTURE_NAMES", None)   # <-- set manually if needed

# Training hyperparams - change if needed
batch_size = globals().get("batch_size", 64)
num_workers = globals().get("num_workers", 4)
n_folds = globals().get("n_folds", 5)
seed = globals().get("seed", 0)
epochs = 32
lr = 5e-4
weight_decay = 1e-4
save_dir = Path("checkpoints")
save_dir.mkdir(exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========= utilities & metric functions =========
# def set_seed(s):
#     random.seed(s)
#     np.random.seed(s)
#     torch.manual_seed(s)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(s)

def compute_contest_metrics(y_true, y_pred, target_labels):
    """
    y_true, y_pred: iterables (either ints or strings) same length
    target_labels: set/list of labels considered 'target' (same representation as labels)
    returns dict: binary_f1, gesture_macro_f1, contest_score
    """
    y_true = np.array(y_true, dtype=object)
    y_pred = np.array(y_pred, dtype=object)
    target_set = set(target_labels)

    # Binary mapping: 1 if target, else 0
    y_true_bin = np.isin(y_true, list(target_set)).astype(int)
    y_pred_bin = np.isin(y_pred, list(target_set)).astype(int)

    # Binary F1 guards
    if y_true_bin.sum() == 0 and y_pred_bin.sum() == 0:
        binary_f1 = 1.0
    elif y_true_bin.sum() == 0 and y_pred_bin.sum() > 0:
        binary_f1 = 0.0
    else:
        binary_f1 = f1_score(y_true_bin, y_pred_bin, average='binary', pos_label=1)

    # Collapse non-target labels into single token
    NON_TARGET = "__NON_TARGET__"
    def collapse_label(l):
        return l if l in target_set else NON_TARGET

    y_true_coll = np.array([collapse_label(x) for x in y_true], dtype=object)
    y_pred_coll = np.array([collapse_label(x) for x in y_pred], dtype=object)

    # gesture macro f1 with guard
    if len(np.unique(y_true_coll)) == 1 and (y_true_coll == y_pred_coll).all():
        gesture_macro_f1 = 1.0
    else:
        gesture_macro_f1 = f1_score(y_true_coll, y_pred_coll, average='macro')

    contest_score = 0.5 * (binary_f1 + gesture_macro_f1)
    return {"binary_f1": float(binary_f1), "gesture_macro_f1": float(gesture_macro_f1), "contest_score": float(contest_score)}

# ========= training / validation functions =========
def make_loaders(dataset_obj, fold_idx):
    """Expect dataset_obj.get_fold_datasets(fold_idx) -> (train_dataset, valid_dataset)"""
    train_ds, valid_ds = dataset_obj.get_fold_datasets(fold_idx)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    return train_loader, valid_loader

def train_one_epoch(model, loader, optimizer, criterion, device, use_amp=False):
    model.train()
    total_loss = 0.0
    preds = []
    trues = []
    #scaler = torch.cuda.amp.GradScaler() if use_amp else None
    scaler = torch.amp.GradScaler("cuda") if use_amp else None

    for batch in tqdm(loader, desc="train", leave=False):
        # expecting batch -> (imu, thm, tof, y)
        imu, thm, tof, y = batch
        imu = imu.to(device)
        thm = thm.to(device)
        tof = tof.to(device)
        y = y.to(device)#.long()

        # Ensure target is class indices (not one-hot)
        # if y.ndim > 1 and y.size(1) > 1:
        #     y = y.argmax(dim=1)
        # y = y.long()
        #changes comment

        optimizer.zero_grad()
        if use_amp:
            with torch.cuda.amp.autocast():
                logits = model(imu, thm, tof)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(imu, thm, tof)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * imu.size(0)
        preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy().tolist())
        trues.extend(y.detach().cpu().numpy().tolist())

    avg_loss = total_loss / len(loader.dataset)
    try:
        # if labels are integers, compute macro F1 on ids
        train_f1 = f1_score(trues, preds, average='macro')
    except Exception:
        train_f1 = 0.0
    return avg_loss, train_f1

def validate_and_score(model, loader, criterion, device, target_labels, id2label=None):
    model.eval()
    total_loss = 0.0
    preds = []
    trues = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="valid", leave=False):
            imu, thm, tof, y = batch
            imu = imu.to(device)
            thm = thm.to(device)
            tof = tof.to(device)
            y = y.to(device)#.long()  changes comment

            logits = model(imu, thm, tof)
            loss = criterion(logits, y)
            total_loss += loss.item() * imu.size(0)

            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
            batch_trues = y.cpu().argmax(axis=1).numpy()

            if id2label is not None:
                batch_preds = [id2label[int(p)] for p in batch_preds]
                batch_trues = [id2label[int(t)] for t in batch_trues]
            else:
                batch_preds = [int(p) for p in batch_preds]
                batch_trues = [int(t) for t in batch_trues]

            preds.extend(batch_preds)
            trues.extend(batch_trues)

    avg_loss = total_loss / len(loader.dataset)
    metrics = compute_contest_metrics(trues, preds, target_labels)
    return avg_loss, metrics

# ========= main training loop =========
def train_all_folds(num_folds=n_folds):
    set_seed(seed)
    overall_start = time.time()

    # Try to infer id2label and target set from dataset object
    id2label = None
    inferred_target_labels = None

    # attempt common attribute locations
    fd = getattr(dataset_obj, "full_dataset", None)
    if fd is None:
        fd = dataset_obj

    # id2label mapping?
    id2label = getattr(fd, "id2label", None) or getattr(fd, "idx2label", None) or getattr(fd, "index_to_label", None)

    # target labels either explicit in dataset or user-supplied
    if TARGET_GESTURE_NAMES:
        inferred_target_labels = TARGET_GESTURE_NAMES
    else:
        # common metadata names
        inferred_target_labels = getattr(fd, "target_labels", None) or getattr(fd, "target_gestures", None) or getattr(fd, "targets", None)

    if inferred_target_labels is None:
        # if id2label exists AND you have a set of names you want as target you could map them.
        raise RuntimeError("TARGET gestures not found. Please set TARGET_GESTURE_NAMES to a list of target gestures (strings), or ensure dataset.full_dataset.target_labels exists.")

    print(f"Using device={device}; batch_size={batch_size}; epochs={epochs}; folds={num_folds}")
    print(f"Detected id2label mapping: {bool(id2label)}; detected target set length: {len(inferred_target_labels)}")

    for fold in range(num_folds):
        print(f"\n=== Fold {fold} training ===")
        train_loader, valid_loader = make_loaders(dataset_obj, fold)
        model = model_fn(**model_args)
        model = model.to(device)

        # Use class weights if provided by dataset; else plain CE
        class_weights = None
        try:
            weights = getattr(fd, "class_weight", None) or getattr(fd, "class_weights", None)
            if weights is not None:
                class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
        except Exception:
            class_weights = None

        criterion = nn.CrossEntropyLoss(weight=class_weights) if class_weights is not None else nn.CrossEntropyLoss()
        # optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(1, epochs//2))

        best_metric = -1.0
        best_epoch = -1

        for epoch in range(1, epochs+1):
            start = time.time()
            train_loss, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion, device, use_amp=True)
            val_loss, val_metrics = validate_and_score(model, valid_loader, criterion, device, inferred_target_labels, id2label=id2label)
            scheduler.step()

            epoch_time = time.time() - start
            print(f"Fold{fold} Epoch {epoch:02d}/{epochs} | time {epoch_time:.1f}s | train_loss {train_loss:.4f} train_macroF1 {train_f1:.4f} | val_loss {val_loss:.4f} | binaryF1 {val_metrics['binary_f1']:.4f} | gestureMacroF1 {val_metrics['gesture_macro_f1']:.4f} | contest {val_metrics['contest_score']:.4f}")

            # save best by contest score
            if val_metrics['contest_score'] > best_metric:
                best_metric = val_metrics['contest_score']
                best_epoch = epoch
                ckpt = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "val_metrics": val_metrics,
                    "model_args": model_args
                }
                # torch.save(ckpt, save_dir / f"best_fold{fold}.pt")
                torch.save(ckpt, f"best_fold{fold}.pt")
                # joblib.dump(ckpt,f"/kaggle/working/models/Bert_Model_CMI_{fold}.pkl")
                # joblib.dump(ckpt,save_dir / f"Bert_Model_CMI_{fold}.pkl")

        print(f"Fold {fold} finished. Best contest score {best_metric:.4f} at epoch {best_epoch}")

    total_time = time.time() - overall_start
    print(f"\nAll folds done in {total_time/60:.1f} minutes.")


In [None]:
# ========== run/training ==========
if __name__ == "__main__":
    train_all_folds()

In [36]:
models2 = list()
for model_dict in model_dicts:
    model_function = model_dict["model_function"]
    model_args = model_dict["model_args"]
    model_path = model_dict["model_path"]
    model = model_function(**model_args).to(CUDA0)
    ckpt = torch.load(model_path)
    state_dict = {k.replace("_orig_mod.", ""): v for k, v in ckpt["model_state_dict"].items()}
    model.load_state_dict(state_dict)
    model = model.eval()
    models2.append(model)