In [None]:
import os
import polars as pl
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from src.model import create_model
from src.dataset import *
from src.trainer import create_model_config
from pathlib import Path
import yaml
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

# CMI Gesture Recognition - Inference Notebook

This notebook demonstrates how to load and use the trained gesture recognition model for inference.

## Key Updates for Simplified Architecture:
- Uses `create_model()` function instead of direct class instantiation
- Simplified chunk-wise processing with dataset-level chunking
- Clean logit averaging for multi-chunk sequences
- Compatible with focal loss + label smoothing trained models

## Usage:
1. Update the `exp_dir` path so it points to your trained model's experiment directory
2. Run all cells to load the model and test inference on a sample of the training data
3. Pass the `predict()` function to the Kaggle inference server (see the commented-out final cell)

In [None]:
# Locate the experiment directory and read the training configuration from it.
exp_dir = Path('../experiments/cmi_training_20250818_234605')  # Update this path as needed
config_path = exp_dir / 'configs' / 'config.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Architecture hyper-parameters come from the saved config; the three sensor
# dimensions below are fixed in this notebook rather than read from the config.
model_section = config['model']
model_config = create_model_config(
    num_classes=model_section['num_classes'],
    d_model=model_section['d_model'],
    hidden_dim=model_section['hidden_dim'],
    num_heads=model_section['num_heads'],
    num_layers=model_section['num_layers'],
    acc_dim=3,  # Standard accelerometer dimensions
    rot_dim=4,  # Standard rotation dimensions
    thm_dim=5,  # Standard thermal dimensions
    dropout=model_section['dropout'],
    max_seq_length=model_section.get('max_seq_length', 5000),  # default when the key is absent
    sequence_processor=model_section['sequence_processor'],
    tof_backbone=model_section['tof_backbone'],
)

# Build the network from the assembled configuration.
model = create_model(**model_config)

# Restore the trained weights on the CPU first, then move the model to `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
checkpoint_path = exp_dir / 'models/best_model.pt'
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()  # switch to evaluation mode for inference

print("Model loaded successfully")

In [None]:
# Read the training tables; they are needed here to rebuild the label encoding.
dataset_dir = Path('../dataset')

# Null and NaN readings in the sequence table are both replaced with -1.0.
train_sequences_df = (
    pl.read_csv(dataset_dir / 'train.csv')
    .fill_null(-1.0)
    .fill_nan(-1.0)
)
train_demographics_df = pl.read_csv(dataset_dir / 'train_demographics.csv')

# prepare_gesture_labels returns the (re-assigned) sequence frame, the fitted label
# encoder, and the target / non-target gesture id collections.
label_setup = prepare_gesture_labels(train_sequences_df)
train_sequences_df, labelencoder, target_gesture_id, non_target_gesture_id = label_setup

# Class names in encoder order, used to map predicted class ids back to gesture names.
gesture_classes = [*labelencoder.classes_]
print(f"Loaded {len(gesture_classes)} gesture classes")

In [None]:
def smooth_chunk_average(logits):
    """
    Smooth per-chunk logits along the first (chunk) axis with a 3-tap window.

    Interior rows become ``0.2 * previous + 0.6 * current + 0.2 * next``. The
    first and last rows have a single neighbour and become
    ``0.8 * self + 0.2 * neighbour``. Every weight is applied to the *original*
    rows, so the result does not depend on the order in which rows are updated.

    Args:
        logits: Array indexed by chunk along its first axis (a torch tensor, or
            any array type offering ``clone()`` or ``copy()`` and slice
            assignment, such as a numpy array).

    Returns:
        A new array with the same shape as ``logits``. The input is left
        unmodified. Inputs with fewer than two rows have no neighbour to blend
        with and are returned as an unchanged copy.
    """
    # Work on a copy so the caller's logits are never overwritten.
    smoothed = logits.clone() if hasattr(logits, "clone") else logits.copy()

    if logits.shape[0] < 2:
        return smoothed

    # Read only from the untouched input; write only to the copy.
    smoothed[1:-1] = logits[:-2] * 0.2 + logits[1:-1] * 0.6 + logits[2:] * 0.2
    smoothed[0] = logits[1] * 0.2 + logits[0] * 0.8
    smoothed[-1] = logits[-2] * 0.2 + logits[-1] * 0.8
    return smoothed

In [None]:
# Inference pipeline summary (informational cell: it only prints a status message).
#
# Key changes from the original pipeline:
# 1. The model is built with the create_model() factory function
# 2. Chunking is done by the dataset rather than by model-side chunking methods
# 3. The final prediction comes from a plain mean of the per-chunk logits
# 4. No attention-based chunk aggregation is needed
# 5. The code structure is cleaner and easier to maintain
#
# Model pipeline:
# Sequence → Dataset chunks → Model forward() → Average logits → Prediction

print("🔄 Updated inference pipeline for simplified architecture")

In [None]:
def predict(data_batch):
    """
    Classify one sensor sequence by scoring each chunk and averaging the logits.

    Args:
        data_batch: Two-item tuple ``(sequence_df, demographics_df)`` as supplied
            by the inference server. Only ``sequence_df`` is used here.

    Returns:
        str: Name of the predicted gesture. Falls back to ``"Text on phone"``
        when the sequence yields nothing to score.
    """
    sequence_df, demographics_df = data_batch
    chunk_size = config['data']['chunk_size']

    # Turn the raw frame into processed sequences; bail out early if none survive.
    sequences = SequenceProcessor().process_dataframe(df=sequence_df, chunk_size=chunk_size)
    if not sequences:
        return "Text on phone"  # Default prediction if no valid sequences

    # Dataset-level chunking, with augmentation switched off for inference.
    dataset = CMIDataset(
        sequences=sequences,
        chunk_size=chunk_size,
        use_chunking=True,
        augmentation_config=None,
    )

    # Model inputs in positional order: time-of-flight, accelerometer, rotation, thermal.
    modalities = ('tof', 'acc', 'rot', 'thm')

    # One forward pass per chunk: add a batch axis of size 1, run the model, and
    # drop the batch axis again so each entry holds a single chunk's class logits.
    with torch.no_grad():
        chunk_logits = [
            model(*(chunk_data[key].unsqueeze(0).to(device) for key in modalities)).squeeze(0)
            for chunk_data in dataset
        ]

    if not chunk_logits:
        return "Text on phone"  # Default prediction

    # A lone chunk is used as-is; several chunks are stacked and mean-pooled.
    final_logits = (
        chunk_logits[0]
        if len(chunk_logits) == 1
        else torch.stack(chunk_logits).mean(dim=0)
    )

    # Highest-scoring class id, mapped back to its gesture name.
    return gesture_classes[final_logits.argmax().item()]

print("Predict function with simplified chunk processing defined")

In [None]:
# Test inference on training data (optional evaluation)

# Upper bound on how many sequences are scored in this quick check.
# Set to None to evaluate every sequence.
MAX_EVAL_SEQUENCES = 10

# maintain_order=True keeps groups in input order, so the evaluated subset is the
# same on every run (polars does not guarantee group order otherwise).
grouped = train_sequences_df.group_by("sequence_id", maintain_order=True)
predictions = []
true_labels = []

for i, (sequence_id, sequence) in enumerate(tqdm(grouped)):
    if MAX_EVAL_SEQUENCES is not None and i >= MAX_EVAL_SEQUENCES:
        break

    try:
        # Resolve both values before appending, so a failure in either one cannot
        # leave `predictions` and `true_labels` with different lengths.
        predicted_gesture = predict((sequence, train_demographics_df))
        true_gesture = sequence['gesture'][0]
    except Exception as e:
        # Best-effort evaluation: report the failing sequence and move on.
        print(f"Error processing sequence {sequence_id[0]}: {e}")
        continue

    predictions.append(predicted_gesture)
    true_labels.append(true_gesture)

print(f"Processed {len(predictions)} sequences")

In [None]:
# Score the collected predictions against the ground-truth labels, if there are any.
if not (predictions and true_labels):
    print("No valid predictions to evaluate")
else:
    from sklearn.metrics import f1_score, accuracy_score

    # Weighted F1 and plain accuracy over the evaluated sequences.
    f1 = f1_score(true_labels, predictions, average='weighted')
    accuracy = accuracy_score(true_labels, predictions)

    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")

    # Print up to five true/predicted pairs for a quick sanity check.
    print(f"\nExample predictions:")
    for idx, guess in enumerate(predictions[:5]):
        truth = true_labels[idx]
        print(f"  True: {truth}")
        print(f"  Pred: {guess}")
        print(f"  Match: {'✓' if truth == guess else '✗'}")
        print()

In [None]:
# mean(logits)
# F1 Score: 0.7187
# Accuracy: 0.7314

# smooth_chunk_average(logits)


In [None]:
# # Initialize and run the inference server
# import kaggle_evaluation.cmi_inference_server

# inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     # Running in Kaggle competition environment
#     inference_server.serve()
# else:
#     # # Running locally for testing
#     inference_server.run_local_gateway(
#         data_paths=(
#             '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv',
#             '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv',
#         )
#     )

# # Show results if running locally
# if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         results = pd.read_parquet("submission.parquet")
#         print(results.head())