In [None]:
# Cell 1: Imports and Configurations (GPU Optimized)
import os
import json
import joblib
import numpy as np
import pandas as pd
import random
import math
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.cuda.amp import autocast, GradScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from timm.scheduler import CosineLRScheduler
from scipy.signal import firwin
from scipy.fft import fft
from scipy.spatial.transform import Rotation as R
from tqdm import tqdm
import polars as pl
from collections import Counter
from typing import List, Dict
import glob

# GPU Optimization Settings
# Favour throughput over bit-exact reproducibility: let cuDNN autotune kernels.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Pick the device once; torch.compile support defaults to "off" and is only
# switched on below when a sufficiently recent GPU is detected.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
COMPILE_SUPPORTED = False

if torch.cuda.is_available():
    gpu_capability = torch.cuda.get_device_capability(0)
    gpu_capability_major = gpu_capability[0]
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Capability: {gpu_capability[0]}.{gpu_capability[1]}")

    # TF32 math is only turned on for compute capability >= 8.0 (Ampere+).
    _tf32_ok = gpu_capability_major >= 8
    torch.backends.cudnn.allow_tf32 = _tf32_ok
    torch.backends.cuda.matmul.allow_tf32 = _tf32_ok
    print("✓ TF32 enabled for Ampere+ GPU" if _tf32_ok else "✓ TF32 disabled for older GPU")

    # torch.compile is only attempted for compute capability >= 7.0.
    COMPILE_SUPPORTED = gpu_capability_major >= 7
    if not COMPILE_SUPPORTED:
        print("⚠ torch.compile not supported on this GPU (capability < 7.0)")
        # Make dynamo fall back to eager instead of raising.
        import torch._dynamo
        torch._dynamo.config.suppress_errors = True

# Competition metric class (same definition as used previously)
class CompetitionMetric:
    """Hierarchical F1 used to score gesture predictions.

    The score is the mean of two components:
      1. binary F1 on "is this a BFRB (target) gesture?", and
      2. macro F1 over gestures where every non-target gesture is collapsed
         into a single ``non_target`` class.

    Collapsing the non-target gestures matches the official competition
    definition; confusing one non-target gesture with another is not penalised.
    """

    def __init__(self):
        # The eight target (BFRB) gestures; everything else counts as non-target.
        self.bfrb_gestures = [
            'Forehead - pull hairline',
            'Neck - pinch skin',
            'Forehead - scratch',
            'Eyelash - pull hair',
            'Eyebrow - pull hair',
            'Neck - scratch',
            'Above ear - pull hair',
            'Cheek - pinch skin',
        ]

    def calculate_hierarchical_f1(self, true_df, pred_df):
        """Return the hierarchical F1 for two frames with a ``gesture`` column.

        Args:
            true_df: frame whose ``gesture`` column holds ground-truth names.
            pred_df: frame whose ``gesture`` column holds predicted names,
                row-aligned with ``true_df``.

        Returns:
            float: ``(binary_f1 + collapsed_macro_f1) / 2``.
        """
        from sklearn.metrics import f1_score

        targets = set(self.bfrb_gestures)  # O(1) membership tests
        y_true = true_df['gesture'].values
        y_pred = pred_df['gesture'].values

        # Level 1: BFRB vs non-BFRB (binary)
        y_true_level1 = [1 if g in targets else 0 for g in y_true]
        y_pred_level1 = [1 if g in targets else 0 for g in y_pred]
        f1_level1 = f1_score(y_true_level1, y_pred_level1, average='binary', zero_division=0)

        # Level 2: macro F1 with all non-target gestures merged into one class
        y_true_level2 = [g if g in targets else 'non_target' for g in y_true]
        y_pred_level2 = [g if g in targets else 'non_target' for g in y_pred]
        f1_level2 = f1_score(y_true_level2, y_pred_level2, average='macro', zero_division=0)

        # Hierarchical F1: average of levels
        return (f1_level1 + f1_level2) / 2

# Configuration
# --- Run mode and paths -------------------------------------------------------
TRAIN = True  # Set to False for inference only
RAW_DIR = Path("/kaggle/input/cmi-detect-behavior-with-sensor-data")
PRETRAINED_DIR = Path("/kaggle/input/cmi3-models-p")  # Used when TRAIN=False
EXPORT_DIR = Path("./")  # Artefacts saved here

# --- Training hyper-parameters ------------------------------------------------
BATCH_SIZE = 512  # Increased for better GPU utilization
# NOTE(review): despite the name, this value is used directly as the padded
# sequence length (assigned to `maxlen` here and to `pad_len` in Cell 4); no
# percentile is computed anywhere in the visible code.
PAD_PERCENTILE = 100
maxlen = PAD_PERCENTILE
LR_INIT = 1e-3        # base learning rate passed to Adam
WD = 3e-3             # weight decay passed to Adam
PATIENCE = 40         # early-stopping patience (epochs without val-loss improvement)
FOLDS = 5             # number of StratifiedKFold splits
random_state = 42     # seed for the fold split and set_seed()
epochs_warmup = 20    # warm-up length in epochs (converted to steps in Cell 5)
warmup_lr_init = 1.822126131809773e-05  # learning rate at the start of warm-up
lr_min = 3.810323058740104e-09          # floor of the cosine schedule
USE_AMP = True        # run forward passes under autocast with a GradScaler
ACCUM_STEPS = 1  # Reduced since we increased batch size

# Same expression as the `device` assignment earlier in this cell; kept so the
# value is visible next to the summary print below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"▶ Imports ready · PyTorch {torch.__version__} · Device: {device}")
if torch.cuda.is_available():
    print(f"▶ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

# Global mean/std for normalization - created directly on `device`.
# Both tables have 32 entries and are reshaped to (1, 32, 1) so they broadcast
# over (batch, channel, time) tensors; the first six entries are 0 / 1, i.e.
# those channels pass through the normalisation unchanged.
# NOTE(review): the origin of these statistics is not shown in this notebook.
fft_bins = 10
extra_channels = 6 * fft_bins  # 3 acc + 3 gyro axes, fft_bins magnitudes each
base_mean = torch.tensor([
    0, 0, 0, 0, 0, 0, 9.0319e-03, 1.0849e+00, -2.6186e-03, 3.7651e-03,
    -5.3660e-03, -2.8177e-03, 1.3318e-03, -1.5876e-04, 6.3495e-01,
    6.2877e-01, 6.0607e-01, 6.2142e-01, 6.3808e-01, 6.5420e-01,
    7.4102e-03, -3.4159e-03, -7.5237e-03, -2.6034e-02, 2.9704e-02,
    -3.1546e-02, -2.0610e-03, -4.6986e-03, -4.7216e-03, -2.6281e-02,
    1.5799e-02, 1.0016e-02
], dtype=torch.float32, device=device).view(1, -1, 1)

# The 1e-8 added after the reshape guards the later division against zero std.
base_std = torch.tensor([
    1, 1, 1, 1, 1, 1, 0.2067, 0.8583, 0.3162,
    0.2668, 0.2917, 0.2341, 0.3023, 0.3281, 1.0264, 0.8838, 0.8686, 1.0973,
    1.0267, 0.9018, 0.4658, 0.2009, 0.2057, 1.2240, 0.9535, 0.6655, 0.2941,
    0.3421, 0.8156, 0.6565, 1.1034, 1.5577
], dtype=torch.float32, device=device).view(1, -1, 1) + 1e-8

# Extend for FFT channels: the appended channels get mean 0 / std 1, so the
# FFT features are left un-normalised. `mean` and `std` are read as globals by
# TwoBranchModel.forward.
mean = torch.cat([base_mean, torch.zeros(1, extra_channels, 1, device=device)], dim=1)
std = torch.cat([base_std, torch.ones(1, extra_channels, 1, device=device)], dim=1)

print("✓ GPU tensors initialized")

In [None]:
# Cell 2: Define Model Classes and Augmentation (GPU Optimized)

class ImuFeatureExtractor(nn.Module):
    """Derive hand-crafted channels from the first six IMU channels.

    Input is ``(batch, channels, time)``; channels 0:3 are treated as the
    accelerometer triple and 3:6 as the second triple (named ``gyro`` here).
    Any further input channels are ignored.

    The output has ``32 + 6 * fft_bins`` channels, in this order:
    acc(3), gyro(3), |acc|(1), |gyro|(1), jerk(3), gyro delta(3), acc^2(3),
    gyro^2(3), acc low-pass(3), acc high-pass(3), gyro low-pass(3),
    gyro high-pass(3), acc FFT magnitudes(3*fft_bins), gyro FFT
    magnitudes(3*fft_bins). The FFT magnitudes are per-sequence constants
    broadcast along the time axis.

    Args:
        fs: sampling rate; stored but not used by the visible code.
        add_quaternion: stored but not used by the visible code.
        fft_bins: number of leading FFT bins kept per axis.
    """

    def __init__(self, fs=100., add_quaternion=False, fft_bins=10):
        super().__init__()
        self.fs = fs
        self.add_quaternion = add_quaternion
        self.fft_bins = fft_bins

        k = 15
        # `lpf` is never called in forward(); it is kept (with its original
        # initialisation) so parameter order, RNG consumption and state_dict
        # keys stay compatible with existing checkpoints.
        self.lpf = nn.Conv1d(6, 6, kernel_size=k, padding=k//2, groups=6, bias=False)
        nn.init.kaiming_uniform_(self.lpf.weight, a=math.sqrt(5))

        # Learnable depthwise (one filter per axis) smoothing convolutions.
        self.lpf_acc = nn.Conv1d(3, 3, k, padding=k//2, groups=3, bias=False)
        self.lpf_gyro = nn.Conv1d(3, 3, k, padding=k//2, groups=3, bias=False)

        # Unused by forward(); kept only for state_dict compatibility.
        self.register_buffer('fft_workspace', torch.zeros(1, 6, fft_bins))

    def forward(self, imu):
        """Return the concatenated feature tensor ``(B, 32 + 6*fft_bins, T)``."""
        B, C, T = imu.shape
        acc = imu[:, 0:3, :]
        gyro = imu[:, 3:6, :]

        # Per-timestep vector magnitudes.
        acc_mag = torch.norm(acc, dim=1, keepdim=True)
        gyro_mag = torch.norm(gyro, dim=1, keepdim=True)

        # First differences, left-padded with a zero so the length stays T.
        jerk = F.pad(torch.diff(acc, dim=-1), (1, 0))
        gyro_delta = F.pad(torch.diff(gyro, dim=-1), (1, 0))

        acc_pow = acc.pow(2)
        gyro_pow = gyro.pow(2)

        # Smoothed signal and its residual ("high-pass" = input - smoothed).
        acc_lpf = self.lpf_acc(acc)
        acc_hpf = acc - acc_lpf
        gyro_lpf = self.lpf_gyro(gyro)
        gyro_hpf = gyro - gyro_lpf

        # Zero-pad the transform when the sequence is shorter than fft_bins so
        # exactly fft_bins magnitudes are always produced; otherwise the output
        # channel count would silently shrink. For T >= fft_bins this equals
        # the un-padded FFT.
        n_fft = max(T, self.fft_bins)
        acc_fft = torch.abs(torch.fft.fft(acc, n=n_fft, dim=-1))[:, :, :self.fft_bins]
        gyro_fft = torch.abs(torch.fft.fft(gyro, n=n_fft, dim=-1))[:, :, :self.fft_bins]

        # (B, 3, bins) -> (B, 3*bins, 1) -> broadcast view over time.
        acc_fft = acc_fft.flatten(1, 2).unsqueeze(-1).expand(-1, -1, T)
        gyro_fft = gyro_fft.flatten(1, 2).unsqueeze(-1).expand(-1, -1, T)

        features = [
            acc, gyro, acc_mag, gyro_mag, jerk, gyro_delta,
            acc_pow, gyro_pow, acc_lpf, acc_hpf, gyro_lpf, gyro_hpf,
            acc_fft, gyro_fft
        ]
        return torch.cat(features, dim=1)

class SEBlock(nn.Module):
    """Squeeze-and-excitation gate for ``(batch, channels, time)`` tensors.

    Each channel is averaged over time, passed through a two-layer bottleneck
    MLP ending in a sigmoid, and the resulting per-channel weight rescales the
    input.
    """

    def __init__(self, channels, reduction=8):
        super().__init__()
        bottleneck = channels // reduction
        self.squeeze = nn.AdaptiveAvgPool1d(1)
        self.excitation = nn.Sequential(
            nn.Linear(channels, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        n_batch, n_chan, _ = x.size()
        pooled = self.squeeze(x).view(n_batch, n_chan)
        gate = self.excitation(pooled).view(n_batch, n_chan, 1)
        return x * gate

class ResidualSECNNBlock(nn.Module):
    """Two conv+BN layers with an SE gate, a residual connection, then pool+dropout.

    The skip path is an identity when the channel count is unchanged and a
    1x1 conv + BN projection otherwise.
    """

    def __init__(self, in_channels, out_channels, kernel_size, pool_size=2, dropout=0.3):
        super().__init__()
        same_pad = kernel_size // 2
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding=same_pad, bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding=same_pad, bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.se = SEBlock(out_channels)

        if in_channels != out_channels:
            # Project the input so it can be added to the main path.
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, 1, bias=False),
                nn.BatchNorm1d(out_channels)
            )
        else:
            self.shortcut = nn.Sequential()

        self.pool = nn.MaxPool1d(pool_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = self.shortcut(x)

        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.bn2(self.conv2(main))
        main = self.se(main)

        merged = F.relu(main + residual)
        return self.dropout(self.pool(merged))

class AttentionLayer(nn.Module):
    """Softmax attention pooling over dim 1 of a ``(batch, steps, hidden)`` tensor."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # One tanh-squashed score per step, normalised across the steps.
        energy = torch.tanh(self.attention(x)).squeeze(-1)
        alpha = F.softmax(energy, dim=1)
        # Weighted sum of the step vectors.
        return (x * alpha.unsqueeze(-1)).sum(dim=1)

class TwoBranchModel(nn.Module):
    """Gesture classifier with an IMU branch and a ToF/thermal branch.

    Pipeline (see ``forward``): split the input features at channel 7, run the
    IMU part through feature engineering, a fixed FIR filter and global
    normalisation, encode both parts with CNN stacks, concatenate them, then
    apply BiLSTM -> Transformer encoder -> attention pooling -> MLP head.

    Args:
        pad_len: accepted for config symmetry; not read anywhere in this class.
        imu_dim_raw: IMU channel count used only when ``feature_engineering``
            is False.
        tof_dim: number of input channels of the second branch.
        n_classes: size of the output logits.
        dropouts: seven dropout rates, consumed in order by imu_block1,
            imu_block2, tof_drop1, tof_drop2, lstm_dropout, drop1, drop2.
            The default list is never mutated here.
        feature_engineering: when True, wrap the IMU input in
            ``ImuFeatureExtractor``.
        fft_bins: forwarded to ``ImuFeatureExtractor``.
        **kwargs: forwarded to ``ImuFeatureExtractor`` (only when
            ``feature_engineering`` is True).

    NOTE(review): ``forward`` normalises with the module-level ``mean`` / ``std``
    tensors rather than registered buffers, so they are not part of the
    state_dict and must exist, with a matching channel count and device, before
    the model is called. With ``feature_engineering=False`` the channel counts
    would not match those tensors — confirm that path is unused.
    """
    def __init__(self, pad_len, imu_dim_raw, tof_dim, n_classes, 
                 dropouts=[0.3, 0.3, 0.3, 0.3, 0.4, 0.5, 0.3], 
                 feature_engineering=True, fft_bins=10, **kwargs):
        super().__init__()
        self.feature_engineering = feature_engineering
        
        if feature_engineering:
            self.imu_fe = ImuFeatureExtractor(fft_bins=fft_bins, **kwargs)
            # 32 base channels + 6 axes * fft_bins FFT channels
            imu_dim = 32 + 6 * fft_bins
        else:
            self.imu_fe = nn.Identity()
            imu_dim = imu_dim_raw

        self.imu_dim = imu_dim
        self.tof_dim = tof_dim
        # Number of leading feature columns treated as IMU input, and also the
        # number of leading channels the FIR filter is applied to.
        self.fir_nchan = 7

        # Fixed (non-trainable) FIR filter stored as a buffer. pass_zero=False
        # with a single cutoff makes firwin design a filter that rejects DC.
        # One copy of the kernel per filtered channel (depthwise conv).
        numtaps = 33
        fir_coef = firwin(numtaps, cutoff=1.0, fs=10.0, pass_zero=False)
        fir_kernel = torch.tensor(fir_coef, dtype=torch.float32).view(1, 1, -1)
        fir_kernel = fir_kernel.repeat(7, 1, 1)
        self.register_buffer("fir_kernel", fir_kernel)

        # IMU branch: two residual SE blocks, each halving the time axis.
        self.imu_block1 = ResidualSECNNBlock(imu_dim, 64, 3, dropout=dropouts[0])
        self.imu_block2 = ResidualSECNNBlock(64, 128, 5, dropout=dropouts[1])

        # ToF branch: two plain conv/BN/pool/dropout stages, also halving time twice.
        self.tof_conv1 = nn.Conv1d(tof_dim, 64, 3, padding=1, bias=False)
        self.tof_bn1 = nn.BatchNorm1d(64)
        self.tof_pool1 = nn.MaxPool1d(2)
        self.tof_drop1 = nn.Dropout(dropouts[2])
        self.tof_conv2 = nn.Conv1d(64, 128, 3, padding=1, bias=False)
        self.tof_bn2 = nn.BatchNorm1d(128)
        self.tof_pool2 = nn.MaxPool1d(2)
        self.tof_drop2 = nn.Dropout(dropouts[3])

        # BiLSTM over the concatenated branches (128 + 128 = 256 input features;
        # 2 * 128 = 256 output features).
        self.bilstm = nn.LSTM(256, 128, bidirectional=True, batch_first=True)
        self.lstm_dropout = nn.Dropout(dropouts[4])

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=256, nhead=8, dim_feedforward=512, 
            dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Attention pooling and classification head
        self.attention = AttentionLayer(256)
        self.dense1 = nn.Linear(256, 256, bias=False)
        self.bn_dense1 = nn.BatchNorm1d(256)
        self.drop1 = nn.Dropout(dropouts[5])
        self.dense2 = nn.Linear(256, 128, bias=False)
        self.bn_dense2 = nn.BatchNorm1d(128)
        self.drop2 = nn.Dropout(dropouts[6])
        self.classifier = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return class logits for ``x``.

        ``x`` is sliced on its last axis and then transposed for Conv1d, i.e.
        it is treated as (batch, time, features) — TODO confirm against caller.
        """
        # Split features: first fir_nchan columns -> IMU, the rest -> ToF;
        # transpose to channels-first for the 1-D convolutions.
        imu = x[:, :, :self.fir_nchan].transpose(1, 2)
        tof = x[:, :, self.fir_nchan:].transpose(1, 2)

        # Feature extraction
        imu = self.imu_fe(imu)
        
        # Apply the fixed FIR filter to the first fir_nchan channels only
        # (depthwise: groups == channels); padding keeps the length unchanged.
        filtered = F.conv1d(
            imu[:, :self.fir_nchan, :],
            self.fir_kernel,
            padding=self.fir_kernel.shape[-1] // 2,
            groups=self.fir_nchan,
        )
        imu = torch.cat([filtered, imu[:, self.fir_nchan:, :]], dim=1)
        
        # Normalization with the module-level `mean` / `std` globals
        imu = (imu - mean) / std

        # Process branches
        x1 = self.imu_block1(imu)
        x1 = self.imu_block2(x1)
        
        x2 = F.relu(self.tof_bn1(self.tof_conv1(tof)))
        x2 = self.tof_drop1(self.tof_pool1(x2))
        x2 = F.relu(self.tof_bn2(self.tof_conv2(x2)))
        x2 = self.tof_drop2(self.tof_pool2(x2))

        # Merge on the channel axis, then back to (batch, time, features)
        merged = torch.cat([x1, x2], dim=1).transpose(1, 2)
        
        lstm_out, _ = self.bilstm(merged)
        lstm_out = self.lstm_dropout(lstm_out)
        
        # Transformer
        trans_out = self.transformer_encoder(lstm_out)
        
        # Attention pooling collapses the time axis
        attended = self.attention(trans_out)
        
        x = F.relu(self.bn_dense1(self.dense1(attended)))
        x = self.drop1(x)
        x = F.relu(self.bn_dense2(self.dense2(x)))
        x = self.drop2(x)
        
        logits = self.classifier(x)
        return logits

# GPU-optimized augmentation
class Augment:
    """Random augmentations for one ``(time, features)`` numpy sequence.

    Calling the object applies, each with its own probability: jitter+scale,
    motion drift on the IMU columns, time warping, frequency-domain noise and
    dropout of all non-IMU columns. The input array is never modified; a new
    array is returned.

    Args:
        p_jitter / sigma / scale_range: probability, noise std and per-column
            scale range of ``jitter_scale``.
        p_dropout: probability of zeroing every column from ``imu_dim`` on.
        p_moda / drift_std / drift_max: probability, step std and clip limit of
            ``motion_drift``.
        p_time_warp / warp_factor: probability and step std of ``time_warp``.
        p_freq_noise / freq_sigma: probability and std of ``freq_noise``.
    """

    def __init__(self, p_jitter=0.8, sigma=0.02, scale_range=[0.9,1.1],
                 p_dropout=0.3, p_moda=0.5, drift_std=0.005, drift_max=0.25,
                 p_time_warp=0.3, warp_factor=0.1, p_freq_noise=0.3, freq_sigma=0.01):
        self.p_jitter = p_jitter
        self.sigma = sigma
        self.scale_min, self.scale_max = scale_range
        self.p_dropout = p_dropout
        self.p_moda = p_moda
        self.drift_std = drift_std
        self.drift_max = drift_max
        self.p_time_warp = p_time_warp
        self.warp_factor = warp_factor
        self.p_freq_noise = p_freq_noise
        self.freq_sigma = freq_sigma

    def time_warp(self, x):
        """Resample ``x`` along axis 0 on a randomly warped time grid.

        Accepts a 1-D signal ``(T,)`` or a 2-D block ``(T, C)``. For 2-D input
        a single warp is drawn and shared by all columns, so the channels stay
        time-aligned with each other. Sequences shorter than two samples are
        returned unchanged (the warp normalisation would divide by zero).

        Returns a new float64 array of the same shape.
        """
        x = np.asarray(x)
        n_steps = x.shape[0]
        if n_steps < 2:
            return x.astype(np.float64)

        t = np.arange(n_steps)
        warp = np.random.normal(1, self.warp_factor, size=n_steps)
        warp = np.cumsum(warp)
        # Rescale the cumulative grid onto [0, T-1].
        warp = (warp - warp.min()) / (warp.max() - warp.min()) * (n_steps - 1)

        if x.ndim == 1:
            return np.interp(t, warp, x)

        warped = np.empty(x.shape, dtype=np.float64)
        for col in range(x.shape[1]):
            warped[:, col] = np.interp(t, warp, x[:, col])
        return warped

    def freq_noise(self, x):
        """Add real-valued Gaussian noise to the spectrum along axis 0 and invert."""
        freq = fft(x, axis=0)
        noise = np.random.normal(0, self.freq_sigma, freq.shape)
        freq += noise
        return np.real(np.fft.ifft(freq, axis=0))

    def jitter_scale(self, x: np.ndarray) -> np.ndarray:
        """Return ``(x + N(0, sigma)) * s`` with one scale ``s`` per column."""
        noise = np.random.randn(*x.shape) * self.sigma
        scale = np.random.uniform(self.scale_min, self.scale_max, size=(1, x.shape[1]))
        return (x + noise) * scale

    def sensor_dropout(self, x: np.ndarray, imu_dim: int) -> np.ndarray:
        """With probability ``p_dropout`` zero all columns from ``imu_dim`` on (in place)."""
        if random.random() < self.p_dropout:
            x[:, imu_dim:] = 0.0
        return x

    def motion_drift(self, x: np.ndarray, imu_dim: int) -> np.ndarray:
        """Add one clipped random-walk drift to the IMU columns (in place)."""
        T = x.shape[0]
        drift = np.cumsum(np.random.normal(scale=self.drift_std, size=(T, 1)), axis=0)
        drift = np.clip(drift, -self.drift_max, self.drift_max)
        x[:, :6] += drift
        if imu_dim > 6:
            x[:, 6:imu_dim] += drift
        return x

    def __call__(self, x: np.ndarray, imu_dim: int) -> np.ndarray:
        """Return an augmented copy of ``x``; ``imu_dim`` is the IMU column count."""
        # Work on a private copy: several steps below write in place, and when
        # jitter is skipped those writes would otherwise hit the caller's data.
        x = np.array(x, copy=True)

        if random.random() < self.p_jitter:
            x = self.jitter_scale(x)
        if random.random() < self.p_moda:
            x = self.motion_drift(x, imu_dim)
        if random.random() < self.p_time_warp:
            # One shared warp for every channel keeps sensors synchronised.
            x[...] = self.time_warp(x)
        if random.random() < self.p_freq_noise:
            # Column-wise FFT over the whole block; noise is i.i.d. per bin.
            x[...] = self.freq_noise(x)
        x = self.sensor_dropout(x, imu_dim)
        return x

# Utility classes
class EarlyStopping:
    """Stop training once the validation loss has stalled for ``patience`` calls.

    A CPU snapshot of the model's state_dict is taken whenever the loss
    improves by more than ``min_delta``; when stopping is signalled the
    snapshot is optionally loaded back into the model.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        """Record ``val_loss``; return True when training should stop."""
        first_call = self.best_loss is None
        improved = first_call or val_loss < self.best_loss - self.min_delta

        if improved:
            self.best_loss = val_loss
            if not first_call:
                self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1

        if self.counter < self.patience:
            return False

        if self.restore_best_weights:
            model.load_state_dict(self.best_weights)
        return True

    def save_checkpoint(self, model):
        """Store a detached CPU copy of every state_dict entry."""
        snapshot = {}
        for key, tensor in model.state_dict().items():
            snapshot[key] = tensor.cpu().clone()
        self.best_weights = snapshot

class EMA:
    """Exponential moving average of a model's trainable parameters.

    ``shadow`` holds the averaged weights keyed by parameter name. Use
    ``apply_shadow`` to copy them into the model (the live weights are kept in
    ``backup``) and ``restore`` to copy the live weights back.

    Values are copied into the existing parameter storage rather than rebound,
    so the shadow tensors never share memory with the live parameters.
    """

    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {}
        # Defined up front so restore() is a harmless no-op before the first
        # apply_shadow() call.
        self.backup = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self, model):
        """Blend the current parameters into the running average."""
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]

    def apply_shadow(self, model):
        """Swap the averaged weights into ``model``, saving the live ones."""
        self.backup = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                # copy_ keeps the parameter's own storage, so later in-place
                # writes to the parameter cannot corrupt the shadow.
                param.data.copy_(self.shadow[name])

    def restore(self, model):
        """Copy the weights saved by ``apply_shadow`` back into ``model``."""
        for name, param in model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}

def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

In [None]:
# Cell 3: Data Handling Functions (GPU Optimized)

def pad_sequences_torch(sequences, maxlen, padding='post', truncating='post', value=0.0):
    """Pad or truncate each 2-D sequence to ``maxlen`` rows and stack as float32.

    ``padding`` / ``truncating`` equal to ``'post'`` act on the end of the
    sequence; any other value acts on the start.
    """
    def _fit(seq):
        n_rows = len(seq)
        if n_rows >= maxlen:
            return seq[:maxlen] if truncating == 'post' else seq[-maxlen:]
        filler = np.full((maxlen - n_rows, seq.shape[1]), value)
        pieces = [seq, filler] if padding == 'post' else [filler, seq]
        return np.concatenate(pieces)

    return np.array([_fit(seq) for seq in sequences], dtype=np.float32)

def preprocess_sequence(df_seq: pd.DataFrame, feature_cols: list, scaler: StandardScaler):
    """Select ``feature_cols``, fill gaps (ffill, bfill, then 0) and scale to float32."""
    selected = df_seq[feature_cols]
    gap_free = selected.ffill().bfill().fillna(0)
    scaled = scaler.transform(gap_free.values)
    return scaled.astype('float32')

class CMI3Dataset(Dataset):
    """Dataset over per-sequence feature matrices and their label vectors.

    Each item is copied, optionally augmented (train mode only), pre-padded /
    pre-truncated to ``maxlen`` rows and returned as a pair of float tensors.
    """

    def __init__(self, X_list, y_list, maxlen, mode="train", imu_dim=7, augment=None):
        self.X_list = X_list
        self.mode = mode
        self.y_list = y_list
        self.maxlen = maxlen
        self.imu_dim = imu_dim
        self.augment = augment

    def pad_sequences_torch(self, seq, maxlen, padding='post', truncating='post', value=0.0):
        """Pad or truncate one 2-D sequence to ``maxlen`` rows.

        ``'post'`` acts on the end of the sequence; any other value acts on
        the start.
        """
        n_rows = seq.shape[0]
        if n_rows >= maxlen:
            return seq[:maxlen] if truncating == 'post' else seq[-maxlen:]

        filler = np.full((maxlen - n_rows, seq.shape[1]), value)
        if padding == 'post':
            return np.concatenate([seq, filler])
        return np.concatenate([filler, seq])

    def __getitem__(self, index):
        # Copy so augmentation can never touch the stored sample.
        sample = self.X_list[index].copy()
        label = self.y_list[index]

        use_augment = self.mode == "train" and self.augment is not None
        if use_augment:
            sample = self.augment(sample, self.imu_dim)

        sample = self.pad_sequences_torch(sample, self.maxlen, 'pre', 'pre')
        return torch.FloatTensor(sample), torch.FloatTensor(label)

    def __len__(self):
        return len(self.X_list)

# Feature engineering functions (unchanged but mentioned for completeness)
def remove_gravity_from_acc(acc_data, rot_data):
    """Subtract gravity, rotated into the sensor frame, from each acceleration row.

    ``acc_data`` / ``rot_data`` may be DataFrames (columns ``acc_x..z`` and
    ``rot_x, rot_y, rot_z, rot_w``) or plain arrays. Rows whose quaternion is
    all-NaN, all-zero or rejected by scipy keep the raw acceleration.
    """
    acc_values = (
        acc_data[['acc_x', 'acc_y', 'acc_z']].values
        if isinstance(acc_data, pd.DataFrame) else acc_data
    )
    quat_values = (
        rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
        if isinstance(rot_data, pd.DataFrame) else rot_data
    )

    gravity_world = np.array([0, 0, 9.81])
    linear_accel = np.zeros_like(acc_values)

    for idx in range(acc_values.shape[0]):
        quat = quat_values[idx]
        corrected = acc_values[idx, :]  # fallback: raw reading

        usable = not (np.all(np.isnan(quat)) or np.all(np.isclose(quat, 0)))
        if usable:
            try:
                gravity_sensor_frame = R.from_quat(quat).apply(gravity_world, inverse=True)
                corrected = acc_values[idx, :] - gravity_sensor_frame
            except ValueError:
                pass  # keep the raw reading for this row

        linear_accel[idx, :] = corrected

    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200):
    """Estimate angular velocity from consecutive quaternions.

    For each pair of neighbouring rows the relative rotation is converted to a
    rotation vector and divided by ``time_delta``. The last row, and any pair
    with an all-NaN, all-zero or scipy-rejected quaternion, stays zero.
    Returns an array of shape ``(num_samples, 3)``.
    """
    quat_values = (
        rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
        if isinstance(rot_data, pd.DataFrame) else rot_data
    )

    def _unusable(q):
        return np.all(np.isnan(q)) or np.all(np.isclose(q, 0))

    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))

    for step in range(num_samples - 1):
        q_now, q_next = quat_values[step], quat_values[step + 1]
        if _unusable(q_now) or _unusable(q_next):
            continue
        try:
            delta_rot = R.from_quat(q_now).inv() * R.from_quat(q_next)
            angular_vel[step, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            pass  # leave this step at zero

    return angular_vel

def calculate_angular_distance(rot_data):
    """Rotation angle (radians) between each quaternion and the next one.

    The last row, and any pair with an all-NaN, all-zero or scipy-rejected
    quaternion, yields 0. Returns an array of length ``num_samples``.
    """
    quat_values = (
        rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
        if isinstance(rot_data, pd.DataFrame) else rot_data
    )

    def _unusable(q):
        return np.all(np.isnan(q)) or np.all(np.isclose(q, 0))

    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)

    for step in range(num_samples - 1):
        q_a, q_b = quat_values[step], quat_values[step + 1]
        angle = 0
        if not (_unusable(q_a) or _unusable(q_b)):
            try:
                relative_rotation = R.from_quat(q_a).inv() * R.from_quat(q_b)
                angle = np.linalg.norm(relative_rotation.as_rotvec())
            except ValueError:
                angle = 0
        angular_dist[step] = angle

    return angular_dist

In [None]:
# Cell 4: Load Dataset if TRAIN (GPU Optimized)
# Training-only data preparation: read train.csv, encode labels, fit and save
# the scaler, turn every sequence into a scaled matrix, pad to a fixed length
# and build the augmenter. Later cells read the globals defined here
# (le, feature_cols, imu_cols, tof_cols, scaler, id_list, X_list_all,
# y_list_all, augmenter).
if TRAIN:
    print("TRAIN MODE – loading dataset …")
    df = pd.read_csv(RAW_DIR / "train.csv")
    le = LabelEncoder()
    df['gesture_int'] = le.fit_transform(df['gesture'])
    # Persist the class order so inference can map indices back to names.
    np.save(EXPORT_DIR / "gesture_classes.npy", le.classes_)
    print(f"✓ Loaded {len(df)} samples with {len(le.classes_)} classes")

    # Every column that is not listed here is treated as a model feature.
    meta_cols = {'gesture', 'gesture_int', 'sequence_type', 'behavior', 'orientation',
                 'row_id', 'subject', 'phase', 'sequence_id', 'sequence_counter'}
    feature_cols = [c for c in df.columns if c not in meta_cols]

    # Columns prefixed thm_/tof_ go to the second branch; the rest count as IMU.
    imu_cols = [c for c in feature_cols if not (c.startswith('thm_') or c.startswith('tof_'))]
    tof_cols = [c for c in feature_cols if c.startswith('thm_') or c.startswith('tof_')]
    
    print(f"✓ IMU features: {len(imu_cols)}, TOF features: {len(tof_cols)}")

    # Fit scaler and save
    # NOTE(review): the fill here runs over the whole frame, so ffill/bfill can
    # carry values across sequence boundaries, and the scaler is fitted on all
    # rows before the cross-validation split — confirm both are intended.
    scaler = StandardScaler().fit(df[feature_cols].ffill().bfill().fillna(0).values)
    joblib.dump(scaler, EXPORT_DIR / "scaler.pkl")

    # Process sequences: one scaled (time, features) matrix per sequence_id,
    # labelled with the gesture of the sequence's first row.
    seq_gp = df.groupby('sequence_id')
    X_list, y_list, id_list = [], [], []
    
    print("Processing sequences...")
    for seq_id, seq in tqdm(seq_gp, desc="Processing sequences"):
        mat = preprocess_sequence(seq, feature_cols, scaler)
        X_list.append(mat)
        y_list.append(seq['gesture_int'].iloc[0])
        id_list.append(seq_id)

    # The padded length is the PAD_PERCENTILE constant itself; it is saved
    # together with the feature column order for use at inference time.
    pad_len = PAD_PERCENTILE
    np.save(EXPORT_DIR / "sequence_maxlen.npy", pad_len)
    np.save(EXPORT_DIR / "feature_cols.npy", np.array(feature_cols))
    
    # Convert to arrays: pre-pad / pre-truncate the sequences and one-hot the labels.
    # NOTE(review): because padding happens here, the training-time augmenter
    # later sees the zero padding as part of the signal.
    id_list = np.array(id_list)
    X_list_all = pad_sequences_torch(X_list, maxlen=pad_len, padding='pre', truncating='pre')
    y_list_all = np.eye(len(le.classes_))[y_list].astype(np.float32)
    
    print(f"✓ Data prepared: {X_list_all.shape[0]} sequences, max length {pad_len}")

    # Initialize augmenter with fixed hyper-parameters
    # (NOTE(review): their origin is not shown in this notebook).
    augmenter = Augment(
        p_jitter=0.9844818619033621, 
        sigma=0.03291295776089293, 
        scale_range=(0.7542342630597011, 1.1625052821731077),
        p_dropout=0.41782786013520684, 
        p_moda=0.3910622476959722, 
        drift_std=0.0040285239353308015, 
        drift_max=0.3929358950258158,
        p_time_warp=0.3, 
        warp_factor=0.1, 
        p_freq_noise=0.3, 
        freq_sigma=0.01
    )

In [None]:
# Cell 5: Training Loop (GPU Optimized)
# K-fold training. For every fold: build loaders, train with AMP + EMA + cosine
# schedule, validate on the EMA weights (second half of each batch with the
# non-IMU sensors zeroed), checkpoint the best EMA weights by validation F1 and
# early-stop on validation loss.
if TRAIN:
    EPOCHS = 125
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=random_state)
    models = []
    fold_scores = []
    metric = CompetitionMetric()

    # Set seed for reproducibility
    set_seed(random_state)

    for fold, (train_idx, val_idx) in enumerate(skf.split(id_list, np.argmax(y_list_all, axis=1))):
        print(f"\n{'='*60}")
        print(f"Training Fold {fold + 1}/{FOLDS}")
        print(f"{'='*60}")

        # Split data
        train_list = X_list_all[train_idx]
        train_y_list = y_list_all[train_idx]
        val_list = X_list_all[val_idx]
        val_y_list = y_list_all[val_idx]

        print(f"Train samples: {len(train_list)}, Val samples: {len(val_list)}")

        train_dataset = CMI3Dataset(train_list, train_y_list, maxlen,
                                    mode="train", imu_dim=len(imu_cols), augment=augmenter)

        # Adjust num_workers based on GPU capability
        num_workers = 8 if COMPILE_SUPPORTED else 4

        train_loader = DataLoader(
            train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers=num_workers,
            drop_last=True,
            pin_memory=True,
            persistent_workers=num_workers > 0
        )

        val_dataset = CMI3Dataset(val_list, val_y_list, maxlen, mode="val")
        # drop_last must be False here: dropping the tail batch would silently
        # exclude validation samples from the metric (or all of them when the
        # fold is smaller than one batch).
        val_loader = DataLoader(
            val_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=num_workers,
            drop_last=False,
            pin_memory=True,
            persistent_workers=num_workers > 0
        )

        # Initialize model
        model = TwoBranchModel(
            maxlen, len(imu_cols), len(tof_cols), len(le.classes_), fft_bins=fft_bins
        ).to(device)

        # Compile model for better performance (only if supported)
        if COMPILE_SUPPORTED and hasattr(torch, 'compile'):
            try:
                model = torch.compile(model)
                print(f"✓ Model compiled for fold {fold+1}")
            except Exception as e:
                print(f"⚠ Compilation failed for fold {fold+1}, using eager mode: {e}")
        else:
            print(f"✓ Using eager mode for fold {fold+1}")

        # torch.compile wraps the module and prefixes its state_dict keys with
        # "_orig_mod."; checkpoints are written from the unwrapped module so a
        # plain TwoBranchModel can load them. Parameters are shared, so EMA
        # swaps done through `model` are visible through `raw_model`.
        raw_model = getattr(model, "_orig_mod", model)

        # Initialize training components
        ema = EMA(model, decay=0.999)

        # Use fused optimizer only if CUDA capability >= 7.0
        if COMPILE_SUPPORTED:
            optimizer = Adam(model.parameters(), lr=LR_INIT, weight_decay=WD, fused=True)
        else:
            optimizer = Adam(model.parameters(), lr=LR_INIT, weight_decay=WD)

        # Learning rate scheduler (stepped once per batch, so all lengths are in steps)
        steps_per_epoch = len(train_loader)
        warmup = epochs_warmup * steps_per_epoch
        nsteps = EPOCHS * steps_per_epoch
        scheduler = CosineLRScheduler(
            optimizer,
            warmup_t=warmup,
            warmup_lr_init=warmup_lr_init,
            warmup_prefix=True,
            t_initial=(nsteps - warmup),
            lr_min=lr_min
        )

        early_stopping = EarlyStopping(patience=PATIENCE, restore_best_weights=True)
        scaler_amp = GradScaler() if USE_AMP else None

        # Training variables
        best_val_score = 0.0
        i_scheduler = 0

        print("Starting training...")
        for epoch in range(EPOCHS):
            # ---------------- Training phase ----------------
            model.train()
            train_preds = []
            train_targets = []
            train_loss = 0.0

            # Gradients are cleared once per optimizer step (not per batch), so
            # ACCUM_STEPS > 1 really accumulates.
            optimizer.zero_grad(set_to_none=True)

            pbar = tqdm(train_loader, desc=f"Fold {fold+1}, Epoch {epoch+1}/{EPOCHS}")
            for batch_idx, (X, y) in enumerate(pbar):
                X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
                target_idx = y.argmax(dim=1)
                do_step = (batch_idx + 1) % ACCUM_STEPS == 0

                if USE_AMP:
                    with autocast():
                        logits = model(X)
                        loss = F.cross_entropy(logits, target_idx)

                    # Divide so the accumulated gradient is an average.
                    scaler_amp.scale(loss / ACCUM_STEPS).backward()

                    if do_step:
                        scaler_amp.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                        scaler_amp.step(optimizer)
                        scaler_amp.update()
                else:
                    logits = model(X)
                    loss = F.cross_entropy(logits, target_idx)
                    (loss / ACCUM_STEPS).backward()

                    if do_step:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                        optimizer.step()

                if do_step:
                    optimizer.zero_grad(set_to_none=True)
                    ema.update(model)

                # Collect predictions
                with torch.no_grad():
                    train_preds.extend(logits.argmax(dim=1).cpu().numpy())
                    train_targets.extend(target_idx.cpu().numpy())

                scheduler.step(i_scheduler)
                i_scheduler += 1
                train_loss += loss.item()

                # Update progress bar
                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'lr': f'{optimizer.param_groups[0]["lr"]:.2e}'
                })

            # ---------------- Validation phase ----------------
            model.eval()
            ema.apply_shadow(model)  # Use EMA weights for validation

            val_preds = []
            val_targets = []
            val_loss_sum = 0.0
            val_seen = 0

            with torch.no_grad():
                for X, y in tqdm(val_loader, desc="Validation", leave=False):
                    X_val = X.to(device, non_blocking=True)
                    y = y.to(device, non_blocking=True)

                    # Sensor-dropout validation: zero the non-IMU columns for the
                    # second half of the batch actually received (the tail
                    # batch may be smaller than BATCH_SIZE).
                    half = X_val.shape[0] // 2
                    X_val[half:, :, len(imu_cols):] = 0.0

                    target_idx = y.argmax(dim=1)
                    if USE_AMP:
                        with autocast():
                            logits = model(X_val)
                            loss = F.cross_entropy(logits, target_idx)
                    else:
                        logits = model(X_val)
                        loss = F.cross_entropy(logits, target_idx)

                    val_preds.extend(logits.argmax(dim=1).cpu().numpy())
                    val_targets.extend(target_idx.cpu().numpy())
                    # Weight by batch size so a short tail batch is not over-counted.
                    val_loss_sum += loss.item() * y.size(0)
                    val_seen += y.size(0)

            ema.restore(model)  # Restore original weights

            # Calculate metrics
            train_acc = metric.calculate_hierarchical_f1(
                pd.DataFrame({'gesture': le.classes_[train_targets]}),
                pd.DataFrame({'gesture': le.classes_[train_preds]})
            )
            val_acc = metric.calculate_hierarchical_f1(
                pd.DataFrame({'gesture': le.classes_[val_targets]}),
                pd.DataFrame({'gesture': le.classes_[val_preds]})
            )

            train_loss /= max(1, len(train_loader))
            val_loss = val_loss_sum / max(1, val_seen)

            print(f"Epoch {epoch+1:3d} | "
                  f"Train Loss: {train_loss:.4f} | Train F1: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f} | Val F1: {val_acc:.4f}")

            # Save best model BEFORE the early-stopping check: the stop path
            # overwrites the live weights and breaks out of the loop, which
            # would otherwise lose a best-F1 epoch that coincides with it.
            if val_acc > best_val_score:
                best_val_score = val_acc
                ema.apply_shadow(model)  # Use EMA weights for saving
                torch.save({
                    'model_state_dict': raw_model.state_dict(),
                    'model_config': {
                        'pad_len': maxlen,
                        'imu_dim_raw': len(imu_cols),
                        'tof_dim': len(tof_cols),
                        'n_classes': len(le.classes_),
                        'fft_bins': fft_bins
                    },
                    # Plain Python float keeps the checkpoint free of numpy objects.
                    'val_score': float(val_acc),
                    'epoch': epoch
                }, EXPORT_DIR / f"gesture_two_branch_fold{fold}.pth")
                ema.restore(model)

            # Early stopping check
            if early_stopping(val_loss, model):
                print(f"Early stopping at epoch {epoch+1}")
                break

        fold_scores.append(best_val_score)
        models.append(model)
        print(f"Fold {fold + 1} completed | Best Val F1: {best_val_score:.4f}")

        # Clear cache
        torch.cuda.empty_cache()

    # Final results
    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    print(f"\n{'='*60}")
    print(f"Training Complete!")
    print(f"Cross-validation scores: {fold_scores}")
    print(f"Mean CV Score: {mean_score:.4f} ± {std_score:.4f}")
    print(f"Models saved in: {EXPORT_DIR}")
    print(f"{'='*60}")

In [None]:
# Cell 6: Prepare Models for Inference (Using Saved Models)
# Load the artefacts written by the training cells (or a pretrained bundle) and
# rebuild one model per fold for ensembling.
print("Preparing models for inference...")

if TRAIN:
    model_dir = EXPORT_DIR
    print("Using artefacts from training session")
else:
    model_dir = PRETRAINED_DIR
    print("INFERENCE MODE – loading artefacts from", model_dir)

# Load saved artifacts consistently
# NOTE(review): allow_pickle=True and joblib.load both unpickle arbitrary
# objects; only point model_dir at artefacts you produced or trust.
feature_cols = np.load(model_dir / "feature_cols.npy", allow_pickle=True).tolist()
pad_len = int(np.load(model_dir / "sequence_maxlen.npy"))
scaler = joblib.load(model_dir / "scaler.pkl")
gesture_classes = np.load(model_dir / "gesture_classes.npy", allow_pickle=True)

print(f"✓ Loaded feature columns: {len(feature_cols)}")
print(f"✓ Sequence max length: {pad_len}")
print(f"✓ Classes: {len(gesture_classes)}")

# Separate IMU and TOF columns
imu_cols = [c for c in feature_cols if not (c.startswith('thm_') or c.startswith('tof_'))]
tof_cols = [c for c in feature_cols if c.startswith('thm_') or c.startswith('tof_')]

# Allowlist the numpy scalar for safe loading (checkpoints may carry a numpy
# float in 'val_score'). add_safe_globals does not exist on older PyTorch
# releases, where the call is simply skipped.
import torch.serialization
import numpy.core.multiarray
if hasattr(torch.serialization, "add_safe_globals"):
    torch.serialization.add_safe_globals([numpy.core.multiarray.scalar])

# Load trained models: one checkpoint per cross-validation fold.
MODELS = [f'gesture_two_branch_fold{i}.pth' for i in range(FOLDS)]
models = []

# Key prefix added to state_dict entries when a torch.compile-wrapped model
# was saved; stripped so such checkpoints load into a plain module.
_COMPILED_PREFIX = "_orig_mod."

print("Loading trained models...")
for i, model_path in enumerate(MODELS):
    full_path = model_dir / model_path
    if not full_path.exists():
        print(f"✗ Model {model_path} not found")
        continue

    checkpoint = torch.load(full_path, map_location=device)

    # Extract model configuration
    config = checkpoint['model_config']
    model = TwoBranchModel(**config).to(device)

    state_dict = {
        (key[len(_COMPILED_PREFIX):] if key.startswith(_COMPILED_PREFIX) else key): value
        for key, value in checkpoint['model_state_dict'].items()
    }
    model.load_state_dict(state_dict)
    model.eval()

    models.append(model)

    # Format the score only when it is present; applying ':.4f' to a fallback
    # string would raise ValueError.
    val_score = checkpoint.get('val_score')
    score_text = f"{val_score:.4f}" if val_score is not None else "N/A"
    print(f"✓ Loaded fold {i} model (Val F1: {score_text})")

if len(models) == 0:
    raise FileNotFoundError("No trained models found in the directory")

print(f"✓ {len(models)} models loaded successfully")

# Enhanced prediction function with proper ensemble
def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
    """
    Classify one sequence by averaging softmax probabilities over all loaded models.

    Args:
        sequence: Polars frame for a single sequence; it is converted to pandas
            and passed through ``preprocess_sequence`` with the saved
            ``feature_cols`` and ``scaler``.
        demographics: Accepted as part of the call signature; not read here.

    Returns:
        The entry of ``gesture_classes`` at the arg-max of the averaged
        probabilities, converted to ``str``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    global models, gesture_classes, scaler, feature_cols, pad_len

    # Polars -> pandas -> feature matrix for this sequence
    features = preprocess_sequence(sequence.to_pandas(), feature_cols, scaler)

    # Pad/truncate at the end to the saved length, as a batch of one
    batch = pad_sequences_torch([features], maxlen=pad_len, padding='post', truncating='post')

    # Float tensor on the active device
    x = torch.FloatTensor(batch).to(device)

    def _member_probs(member):
        # Forward pass followed by softmax over dim 1
        return F.softmax(member(x), dim=1)

    with torch.no_grad():
        prob_sum = None

        for member in models:
            member.eval()

            # Run under autocast only when mixed precision is switched on
            if USE_AMP:
                with autocast():
                    member_probs = _member_probs(member)
            else:
                member_probs = _member_probs(member)

            # First member seeds the accumulator; later ones are added in place
            if prob_sum is not None:
                prob_sum += member_probs
            else:
                prob_sum = member_probs

        if prob_sum is None:
            raise ValueError("No models available for prediction")

        # Sum -> mean, then pick the winning class index for the single item
        prob_sum /= len(models)
        predicted_idx = prob_sum.argmax(dim=1).item()

    return str(gesture_classes[predicted_idx])

In [None]:
# Cell 7: Kaggle API Setup
# Registers predict() with the competition inference server, then either serves
# (competition rerun) or replays the local test files through the gateway.
# NOTE(review): the server is only set up when TRAIN is truthy, so in inference
# mode (TRAIN falsy) predict() is never served — confirm this gating is intended.
if TRAIN:
    print("Setting up Kaggle inference server...")
    try:
        import kaggle_evaluation.cmi_inference_server
        inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

        # The environment variable is set (non-empty) during a competition rerun
        if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
            print("Running local gateway...")
            inference_server.run_local_gateway(
                data_paths=(
                    '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv',
                    '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv',
                )
            )
        else:
            print("Running in competition mode...")
            inference_server.serve()
    except ImportError:
        # Outside Kaggle the evaluation package is absent; fall back to manual use
        print("Kaggle evaluation module not available - skipping inference server setup")
        print("Models are loaded and predict() function is ready for manual testing")

print("✓ Setup complete!")