<a href="https://colab.research.google.com/github/souvikdas1990/Testing/blob/main/MABe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

mabe_mouse_behavior_detection_path = kagglehub.competition_download('MABe-mouse-behavior-detection')

print('Data source import complete.')


In [None]:
import re
import cv2
import gc
import os
import json
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from sklearn.utils import resample
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.cuda.amp import autocast, GradScaler
import time
from pathlib import Path

In [None]:

tracking_path = '/kaggle/input/MABe-mouse-behavior-detection/train_tracking/'
annotation_path = '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/'
train_csv_path = '/kaggle/input/MABe-mouse-behavior-detection/train.csv'

try:
    train_df = pd.read_csv(train_csv_path)
    print("First 5 rows of train.csv:")
    display(train_df.head())
    print(train_df.shape)
except FileNotFoundError:
    print(f"Error: train.csv not found at {train_csv_path}")
except Exception as e:
    print(f"An error occurred while reading train.csv: {e}")

First 5 rows of train.csv:


Unnamed: 0,lab_id,video_id,mouse1_strain,mouse1_color,mouse1_sex,mouse1_id,mouse1_age,mouse1_condition,mouse2_strain,mouse2_color,...,pix_per_cm_approx,video_width_pix,video_height_pix,arena_width_cm,arena_height_cm,arena_shape,arena_type,body_parts_tracked,behaviors_labeled,tracking_method
0,AdaptableSnail,44566106,CD-1 (ICR),white,male,10.0,8-12 weeks,wireless device,CD-1 (ICR),white,...,16.0,1228,1068,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""head...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
1,AdaptableSnail,143861384,CD-1 (ICR),white,male,3.0,8-12 weeks,,CD-1 (ICR),white,...,9.7,968,608,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
2,AdaptableSnail,209576908,CD-1 (ICR),white,male,7.0,8-12 weeks,,CD-1 (ICR),white,...,16.0,1266,1100,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
3,AdaptableSnail,278643799,CD-1 (ICR),white,male,11.0,8-12 weeks,wireless device,CD-1 (ICR),white,...,16.0,1224,1100,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""head...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut
4,AdaptableSnail,351967631,CD-1 (ICR),white,male,14.0,8-12 weeks,,CD-1 (ICR),white,...,16.0,1204,1068,60.0,60.0,square,familiar,"[""body_center"", ""ear_left"", ""ear_right"", ""late...","[""mouse1,mouse2,approach"", ""mouse1,mouse2,atta...",DeepLabCut


(8789, 38)


In [None]:
valid_pairs = []

# Walk over each lab folder in annotation_path
for lab in os.listdir(annotation_path):
    ann_lab_folder = os.path.join(annotation_path, lab)
    if not os.path.isdir(ann_lab_folder):
        continue
    # collect all video_ids (strip .parquet)
    ann_files = [f.replace(".parquet", "") for f in os.listdir(ann_lab_folder) if f.endswith(".parquet")]
    for vid in ann_files:
        valid_pairs.append((lab, vid))

# Build a DataFrame of valid (lab_id, video_id) pairs
valid_df = pd.DataFrame(valid_pairs, columns=["lab_id", "video_id"])

# Filter train_df to only keep rows that appear in valid_df
train_df["video_id"] = train_df["video_id"].astype(str)
valid_df["video_id"] = valid_df["video_id"].astype(str)
train_df = train_df.merge(valid_df, on=["lab_id","video_id"], how="inner")

print("After filtering, train_df shape:", train_df.shape)
print("Unique labs left:", train_df["lab_id"].nunique())


After filtering, train_df shape: (847, 38)
Unique labs left: 19


In [None]:
# ====================================================
# Build merged labeled dataset from tracking + annotation parquet files
# ====================================================

KEEP_NIL_FRAC = 0  # keep 10% of NIL rows

all_chunks = []   # collect per-video DataFrames (beware memory if you keep all)

for _, row in train_df.iterrows():
    lab_id   = row['lab_id']
    video_id = row['video_id']

    tracking_file_path   = os.path.join(tracking_path,   lab_id, f'{video_id}.parquet')
    annotation_file_path = os.path.join(annotation_path, lab_id, f'{video_id}.parquet')

    print(f"Processing Lab ID: {lab_id}, Video ID: {video_id}")

    # --- load tracking ---
    try:
        df_tracking = pd.read_parquet(tracking_file_path)
        # init default labels
        df_tracking['target_id'] = 0
        df_tracking['action'] = "NIL"
    except Exception as e:
        print(f"  Error reading tracking file: {e}")
        continue

    # --- load annotations & stamp agent rows ---
    try:
        df_annotation = pd.read_parquet(annotation_file_path)

        # minimal safety check
        need = {'start_frame','stop_frame','agent_id','target_id','action'}
        if not need.issubset(df_annotation.columns):
            print(f"  Annotation missing cols {need - set(df_annotation.columns)}; skipping labels.")

        else:
            for _, ann in df_annotation.iterrows():
                mask_agent = (
                    (df_tracking['video_frame'] >= ann['start_frame']) &
                    (df_tracking['video_frame'] <= ann['stop_frame']) &
                    (df_tracking['mouse_id'] == ann['agent_id'])
                )
                df_tracking.loc[mask_agent, 'target_id'] = ann['target_id']
                df_tracking.loc[mask_agent, 'action']    = ann['action']

                # (optional) also tag target rows with same action:
                # if pd.notna(ann['target_id']):
                #     mask_target = (
                #         (df_tracking['video_frame'] >= ann['start_frame']) &
                #         (df_tracking['video_frame'] <= ann['stop_frame']) &
                #         (df_tracking['mouse_id'] == ann['target_id'])
                #     )
                #     df_tracking.loc[mask_target, 'action'] = ann['action']
                #     df_tracking.loc[mask_target, 'target_id'] = ann['agent_id']

    except Exception as e:
        print(f"  Error reading annotation file: {e}")

    # --- add metadata columns first ---
    df_tracking['lab_id'] = lab_id
    df_tracking['video_id'] = video_id
    cols = ['lab_id', 'video_id'] + [c for c in df_tracking.columns if c not in ('lab_id','video_id')]
    df_tracking = df_tracking[cols]

    # --- drop 90% of NIL rows (keep only 10%) per video ---
    nil_mask = (df_tracking['action'] == 'NIL')

    # per-video deterministic RNG seed
    seed = (hash((str(lab_id), str(video_id))) & 0xFFFFFFFF)
    rng = np.random.RandomState(seed)

    # vectorized keep mask: keep all positives + 10% of NILs
    keep_nil_mask = nil_mask & (rng.rand(len(df_tracking)) < KEEP_NIL_FRAC)
    pos_mask = ~nil_mask
    keep_mask = pos_mask | keep_nil_mask

    df_tracking = df_tracking.loc[keep_mask].reset_index(drop=True)

    # append reduced per-video chunk
    all_chunks.append(df_tracking)

# --- concatenate all reduced chunks ---
df_merged = pd.concat(all_chunks, ignore_index=True)

print(f"Merged (reduced) dataset shape = {df_merged.shape}")

# Optionally save
# out_path = "/kaggle/working/merged_dataset.parquet"
# df_merged.to_parquet(out_path, index=False)


Processing Lab ID: AdaptableSnail, Video ID: 44566106
Processing Lab ID: AdaptableSnail, Video ID: 143861384
Processing Lab ID: AdaptableSnail, Video ID: 209576908
Processing Lab ID: AdaptableSnail, Video ID: 278643799
Processing Lab ID: AdaptableSnail, Video ID: 351967631
Processing Lab ID: AdaptableSnail, Video ID: 355542626
Processing Lab ID: AdaptableSnail, Video ID: 678426900
Processing Lab ID: AdaptableSnail, Video ID: 705948978
Processing Lab ID: AdaptableSnail, Video ID: 878123481
Processing Lab ID: AdaptableSnail, Video ID: 1212811043
Processing Lab ID: AdaptableSnail, Video ID: 1260392287
Processing Lab ID: AdaptableSnail, Video ID: 1351098077
Processing Lab ID: AdaptableSnail, Video ID: 1408652858
Processing Lab ID: AdaptableSnail, Video ID: 1596473327
Processing Lab ID: AdaptableSnail, Video ID: 1643942986
Processing Lab ID: AdaptableSnail, Video ID: 1717182687
Processing Lab ID: AdaptableSnail, Video ID: 2078515636
Processing Lab ID: BoisterousParrot, Video ID: 402963089
P

In [None]:
# ====================================================
# Cell: Load ONLY tracking rows that match df_merged
#        (same lab_id, video_id, video_frame AND mouse_id == target_id)
# ====================================================
# Inputs assumed:
#   - train_df with columns ['lab_id','video_id']
#   - tracking_path root containing <lab_id>/<video_id>.parquet
#   - df_merged with columns ['lab_id','video_id','video_frame','target_id']
# Outputs:
#   - df_full_tracking_all: concatenation of ONLY the matching tracking rows
# ====================================================

# 1) Build the (lab_id, video_id, video_frame, target_id) key set from df_merged
if 'target_id' not in df_merged.columns:
    raise ValueError("df_merged must contain 'target_id' column.")

keys_all = (
    df_merged.loc[df_merged['target_id'].fillna(0).astype(int) > 0,
                  ['lab_id','video_id','video_frame','target_id']]
    .dropna()
    .drop_duplicates()
    .copy()
)

# normalize dtypes used for joining
keys_all['lab_id']      = keys_all['lab_id'].astype(str)
keys_all['video_id']    = keys_all['video_id']
keys_all['video_frame'] = keys_all['video_frame'].astype(int, errors='ignore')
keys_all['target_id']   = keys_all['target_id'].astype(int, errors='ignore')

all_train_chunks = []
seen = set()  # avoid re-reading duplicates, if any

for _, row in train_df.iterrows():
    lab_id   = str(row['lab_id'])
    video_id = row['video_id']

    # keep only if we actually have any keys for this (lab, video)
    keys_this = keys_all[(keys_all['lab_id'] == lab_id) & (keys_all['video_id'] == video_id)]
    if keys_this.empty:
        continue

    key = (lab_id, video_id)
    if key in seen:
        continue
    seen.add(key)

    tracking_file_path = os.path.join(tracking_path, lab_id, f'{video_id}.parquet')
    try:
        df_full_tracking = pd.read_parquet(tracking_file_path)

        # --- minimal schema normalization ---
        # some schemas may use 'frame' instead of 'video_frame'
        if 'video_frame' not in df_full_tracking.columns and 'frame' in df_full_tracking.columns:
            df_full_tracking = df_full_tracking.rename(columns={'frame': 'video_frame'})

        # enforce dtypes for join
        if 'video_frame' in df_full_tracking.columns:
            df_full_tracking['video_frame'] = df_full_tracking['video_frame'].astype(int, errors='ignore')
        if 'mouse_id' in df_full_tracking.columns:
            df_full_tracking['mouse_id'] = df_full_tracking['mouse_id'].astype(int, errors='ignore')

        # add metadata for traceability (if not already present)
        df_full_tracking['lab_id'] = lab_id
        df_full_tracking['video_id'] = video_id

        # --- build join keys for this video ---
        # need columns: ['video_frame','mouse_id'] where mouse_id == target_id
        join_keys = keys_this[['video_frame','target_id']].rename(columns={'target_id':'mouse_id'}).drop_duplicates()

        # inner join to keep ONLY rows that match (video_frame, mouse_id == target_id)
        df_match = join_keys.merge(
            df_full_tracking,
            on=['video_frame','mouse_id'],
            how='inner'
        )

        if df_match.empty:
            print(f"[INFO] No matching rows for lab={lab_id}, video={video_id}.")
            continue

        # reorder to keep metadata first
        cols = ['lab_id', 'video_id'] + [c for c in df_match.columns if c not in ('lab_id','video_id')]
        df_match = df_match[cols]

        all_train_chunks.append(df_match)
        print(f"Loaded filtered tracking: lab={lab_id}, video={video_id}, shape={df_match.shape}")

    except Exception as e:
        print(f"[WARN] Could not read Train tracking for lab={lab_id}, video={video_id}: {e}")

# Concatenate if any found
if len(all_train_chunks) > 0:
    df_full_tracking_all = pd.concat(all_train_chunks, ignore_index=True)
    print(f"Combined filtered tracking shape: {df_full_tracking_all.shape}")
else:
    df_full_tracking_all = pd.DataFrame()
    print("No matching tracking rows were loaded.")

# Peek
#df_full_tracking_all.head()


Loaded filtered tracking: lab=AdaptableSnail, video=44566106, shape=(220145, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=143861384, shape=(108312, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=209576908, shape=(111930, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=278643799, shape=(231816, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=351967631, shape=(89398, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=355542626, shape=(126721, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=678426900, shape=(92346, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=705948978, shape=(117110, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=878123481, shape=(6944, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=1212811043, shape=(99012, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=1260392287, shape=(38272, 7)
Loaded filtered tracking: lab=AdaptableSnail, video=1351098077, shape=(43761, 7)
Loaded filtered tracking: lab=Ada

In [None]:
# Count the true labels in the original test labels
#print("True label counts in the data used for inference:")
# Use the un-encoded labels before they went into the DataLoader
#print(df_merged['action'].value_counts())
#unique_values_array = df_merged['bodypart'].unique()

#print("Unique values (as a NumPy array):")
#print(unique_values_array)
#print(len(unique_values_array))
#unique_action_array = df_merged['action'].unique()

#print("Unique values (as a NumPy array):")
#print(unique_action_array)
#print(len(unique_action_array))
#print(df_merged['action'].value_counts())

In [None]:
# ====================================================
# Cell: Join df_merged ⟷ df_full_tracking_all and ADD target_x / target_y
#  - Preserves df_merged row count (LEFT JOIN)
#  - Keys: lab_id, video_id, video_frame, target_id, bodypart==target_bodypart
# ====================================================

def to_long_pose(df):
    """Normalize tracking df into long format: ['lab_id','video_id','video_frame','mouse_id','bodypart','x','y']"""
    df = df.copy()
    # unify frame name
    if 'video_frame' not in df.columns and 'frame' in df.columns:
        df = df.rename(columns={'frame':'video_frame'})
    base = [c for c in ['lab_id','video_id','video_frame','mouse_id'] if c in df.columns]

    # already long?
    if 'bodypart' in df.columns and {'x','y'}.issubset(df.columns):
        return df[base + ['bodypart','x','y']].copy()

    cols = df.columns.tolist()
    # Pattern A: x_<bp>, y_<bp>
    x_bp = [(c, c.split('x_',1)[1]) for c in cols
            if c.startswith('x_') and len(c) > 2 and f"y_{c.split('x_',1)[1]}" in cols]
    # Pattern B: <bp>_x, <bp>_y
    bpx = [(c, c[:-2]) for c in cols if c.endswith('_x') and (c[:-2] + '_y') in cols]

    long_rows = []
    if x_bp or bpx:
        if x_bp:
            for xcol, bp in x_bp:
                ycol = f'y_{bp}'
                sub = df[base + [xcol, ycol]].copy()
                sub['bodypart'] = bp
                sub = sub.rename(columns={xcol:'x', ycol:'y'})
                long_rows.append(sub)
        if bpx:
            for xcol, bp in bpx:
                ycol = f'{bp}_y'
                sub = df[base + [xcol, ycol]].copy()
                sub['bodypart'] = bp
                sub = sub.rename(columns={xcol:'x', ycol:'y'})
                long_rows.append(sub)
        return pd.concat(long_rows, ignore_index=True)[base + ['bodypart','x','y']]

    # Fallback center
    out = df[base].copy()
    if {'x','y'}.issubset(df.columns):
        out['x'] = df['x']; out['y'] = df['y']; out['bodypart'] = 'body_center'
    elif {'body_center_x','body_center_y'}.issubset(df.columns):
        out['x'] = df['body_center_x']; out['y'] = df['body_center_y']; out['bodypart'] = 'body_center'
    else:
        guess_x = [c for c in cols if re.search(r'(^x$|_x$|^x_|center_x$)', c)]
        guess_y = [c for c in cols if re.search(r'(^y$|_y$|^y_|center_y$)', c)]
        out['x'] = df[guess_x].mean(axis=1, skipna=True) if guess_x else np.nan
        out['y'] = df[guess_y].mean(axis=1, skipna=True) if guess_y else np.nan
        out['bodypart'] = 'center_mean'
    return out[base + ['bodypart','x','y']]

# --- Safety: inputs
if 'df_merged' not in globals():
    raise RuntimeError("df_merged is not defined.")
if 'df_full_tracking_all' not in globals():
    raise RuntimeError("df_full_tracking_all is not defined.")

# --- Dtype align on df_merged
df_merged['lab_id']      = df_merged['lab_id'].astype(str)
df_merged['video_id']    = df_merged['video_id'].astype(str)
df_merged['video_frame'] = df_merged['video_frame'].astype(int, errors='ignore')
df_merged['target_id']   = df_merged['target_id'].fillna(0).astype(int, errors='ignore')

# --- Normalize tracking to long + dtypes
trk_long = to_long_pose(df_full_tracking_all)
for k in ['lab_id','video_id']:
    if k in trk_long.columns:
        trk_long[k] = trk_long[k].astype(str)
if 'video_frame' in trk_long.columns:
    trk_long['video_frame'] = trk_long['video_frame'].astype(int, errors='ignore')
if 'mouse_id' in trk_long.columns:
    trk_long['mouse_id'] = trk_long['mouse_id'].astype(int, errors='ignore')

# --- Build right table with target coords
right_tbl = trk_long.rename(columns={
    'mouse_id': 'target_id',
    'bodypart': 'target_bodypart',
    'x': 'target_x',
    'y': 'target_y'
})[['lab_id','video_id','video_frame','target_id','target_bodypart','target_x','target_y']]

# --- LEFT JOIN on same bodypart
df_joined = df_merged.merge(
    right_tbl,
    left_on = ['lab_id','video_id','video_frame','target_id','bodypart'],
    right_on= ['lab_id','video_id','video_frame','target_id','target_bodypart'],
    how='left'
)

# If you want canonical column names, keep 'target_x','target_y' and 'target_bodypart'
# Drop the duplicate right-side key column (target_bodypart) only if the left already had one:
# (If left also had 'target_bodypart', prefer the joined one where available.)
if 'target_bodypart_x' in df_joined.columns and 'target_bodypart_y' in df_joined.columns:
    # rare case from earlier merges; ignore
    pass
elif 'target_bodypart' in df_joined.columns and 'target_bodypart_y' in df_joined.columns:
    # also rare; ignore
    pass
else:
    # nothing to do; 'target_bodypart' is the right-side column we just created

    # If you want to keep only one column name for the bodypart on the right:
    pass

print("Joined shape (rows preserved from df_merged):", df_joined.shape)
#print("Has target_x/target_y?", 'target_x' in df_joined.columns, 'target_y' in df_joined.columns)
#display(df_joined.head(8)[['lab_id','video_id','video_frame','mouse_id','bodypart','x','y','target_id','action','target_bodypart','target_x','target_y']])


Joined shape (rows preserved from df_merged): (36268844, 12)


In [None]:
display(df_joined[(df_joined['video_id'] == '1335286655') & (df_joined['video_frame'] == 1807)])

Unnamed: 0,lab_id,video_id,video_frame,mouse_id,bodypart,x,y,target_id,action,target_bodypart,target_x,target_y
35826136,UppityFerret,1335286655,1807,2,body_center,429.630768,108.147102,1,sniffgenital,body_center,291.476257,89.354416
35826137,UppityFerret,1335286655,1807,2,ear_left,375.57428,95.136604,1,sniffgenital,ear_left,274.16214,128.112534
35826138,UppityFerret,1335286655,1807,2,ear_right,386.850861,78.758545,1,sniffgenital,ear_right,256.082672,117.866013
35826139,UppityFerret,1335286655,1807,2,hip_left,434.237671,128.29213,1,sniffgenital,hip_left,312.042694,92.608467
35826140,UppityFerret,1335286655,1807,2,hip_right,452.410065,101.695229,1,sniffgenital,hip_right,289.378113,69.918938
35826141,UppityFerret,1335286655,1807,2,lateral_left,407.644318,115.246956,1,sniffgenital,lateral_left,295.105072,109.398476
35826142,UppityFerret,1335286655,1807,2,lateral_right,421.875519,90.393761,1,sniffgenital,lateral_right,272.420288,88.149788
35826143,UppityFerret,1335286655,1807,2,nose,361.141388,72.57859,1,sniffgenital,nose,259.204254,142.733337
35826144,UppityFerret,1335286655,1807,2,spine_1,407.004211,99.216606,1,sniffgenital,spine_1,277.383545,105.047379
35826145,UppityFerret,1335286655,1807,2,spine_2,451.77887,119.910675,1,sniffgenital,spine_2,305.98114,77.128525


In [None]:
# --- Setup ---
OUTPUT_DIR = '/kaggle/working/data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

file_path = os.path.join(OUTPUT_DIR, 'data_final.parquet')
df_joined.to_parquet(file_path, index=False)

In [None]:
del df_merged
del df_full_tracking_all
del df_full_tracking
del train_df
del valid_df
del df_tracking
del df_annotation
#del _6
gc.collect()

8

In [None]:
ID_COLS = ['mouse_id', 'target_id', 'target_bodypart']
df_joined.drop(columns=ID_COLS, axis=1, inplace=True)
#display(df_joined.head())

In [None]:
rename_map = {}
if 'x' in df_joined.columns:         rename_map['x'] = 'mouse_A_x'
if 'y' in df_joined.columns:         rename_map['y'] = 'mouse_A_y'
if 'target_x' in df_joined.columns:  rename_map['target_x'] = 'mouse_B_x'
if 'target_y' in df_joined.columns:  rename_map['target_y'] = 'mouse_B_y'

df_joined = df_joined.rename(columns=rename_map)

# move `action` to the far right (if present)
cols = df_joined.columns.tolist()
if 'action' in cols:
    cols_no_action = [c for c in cols if c != 'action']
    cols = cols_no_action + ['action']
    df_joined = df_joined[cols]

print("Renamed columns applied. Shape:", df_joined.shape)
display(df_joined.head())

Renamed columns applied. Shape: (36268844, 9)


Unnamed: 0,lab_id,video_id,video_frame,bodypart,mouse_A_x,mouse_A_y,mouse_B_x,mouse_B_y,action
0,AdaptableSnail,44566106,4,body_center,338.654999,468.442993,338.654999,468.442993,rear
1,AdaptableSnail,44566106,4,ear_left,396.808014,460.20401,396.808014,460.20401,rear
2,AdaptableSnail,44566106,4,ear_right,329.583008,514.291016,329.583008,514.291016,rear
3,AdaptableSnail,44566106,4,headpiece_bottombackright,378.06601,483.789001,378.06601,483.789001,rear
4,AdaptableSnail,44566106,4,headpiece_bottomfrontleft,370.57901,541.814026,370.57901,541.814026,rear


In [None]:
KEYS = ['lab_id','video_id','video_frame']
g = df_joined.groupby(KEYS, observed=True)

# Bounds across BOTH mice per group
Aminx = g['mouse_A_x'].transform('min'); Bminx = g['mouse_B_x'].transform('min')
Amaxx = g['mouse_A_x'].transform('max'); Bmaxx = g['mouse_B_x'].transform('max')
Aminy = g['mouse_A_y'].transform('min'); Bminy = g['mouse_B_y'].transform('min')
Amaxy = g['mouse_A_y'].transform('max'); Bmaxy = g['mouse_B_y'].transform('max')

xmin = np.minimum(Aminx, Bminx)
xmax = np.maximum(Amaxx, Bmaxx)
ymin = np.minimum(Aminy, Bminy)
ymax = np.maximum(Amaxy, Bmaxy)

# Avoid divide-by-zero
eps = 1e-6
den_x = (xmax - xmin).where((xmax - xmin) != 0, other=1.0)
den_y = (ymax - ymin).where((ymax - ymin) != 0, other=1.0)

# In-place normalize to [0,1]
df_joined['mouse_A_x'] = ((df_joined['mouse_A_x'] - xmin) / (den_x + eps)).clip(0, 1)
df_joined['mouse_A_y'] = ((df_joined['mouse_A_y'] - ymin) / (den_y + eps)).clip(0, 1)
df_joined['mouse_B_x'] = ((df_joined['mouse_B_x'] - xmin) / (den_x + eps)).clip(0, 1)
df_joined['mouse_B_y'] = ((df_joined['mouse_B_y'] - ymin) / (den_y + eps)).clip(0, 1)

print("Normalized in place. Shape:", df_joined.shape)
display(df_joined.head())

Normalized in place. Shape: (36268844, 9)


Unnamed: 0,lab_id,video_id,video_frame,bodypart,mouse_A_x,mouse_A_y,mouse_B_x,mouse_B_y,action
0,AdaptableSnail,44566106,4,body_center,0.263233,0.264633,0.263233,0.264633,rear
1,AdaptableSnail,44566106,4,ear_left,0.892431,0.198089,0.892431,0.198089,rear
2,AdaptableSnail,44566106,4,ear_right,0.165076,0.634933,0.165076,0.634933,rear
3,AdaptableSnail,44566106,4,headpiece_bottombackright,0.689648,0.388578,0.689648,0.388578,rear
4,AdaptableSnail,44566106,4,headpiece_bottomfrontleft,0.608641,0.857228,0.608641,0.857228,rear


In [None]:
gc.collect()

0

In [None]:
OUT_DIR = "/kaggle/working/skeleton_imgs"
CSV_PATH = "/kaggle/working/skeleton_labels.csv"
IMG_SIZE = 256  # square canvas

os.makedirs(OUT_DIR, exist_ok=True)

# Optional edges; points are always drawn even if an edge is missing
EDGES = [
    ("head", "nose"),
    ("head", "ear_left"), ("head", "ear_right"),
    ("head", "neck"),
    ("neck", "spine_1"), ("spine_1", "spine_2"),
    ("spine_2", "body_center"),
    ("body_center", "lateral_left"), ("body_center", "lateral_right"),
    ("spine_2", "hip_left"), ("spine_2", "hip_right"),
    ("body_center", "tail_base"),
    ("tail_base", "tail_midpoint"),
    ("tail_midpoint", "tail_middle_1"),
    ("tail_middle_1", "tail_middle_2"),
    ("tail_middle_2", "tail_tip"),
    # headpiece rings (drawn only if present)
    ("headpiece_topbackleft","headpiece_topbackright"),
    ("headpiece_topfrontleft","headpiece_topfrontright"),
    ("headpiece_topbackleft","headpiece_topfrontleft"),
    ("headpiece_topbackright","headpiece_topfrontright"),
    ("headpiece_bottombackleft","headpiece_bottombackright"),
    ("headpiece_bottomfrontleft","headpiece_bottomfrontright"),
    ("headpiece_bottombackleft","headpiece_bottomfrontleft"),
    ("headpiece_bottombackright","headpiece_bottomfrontright"),
    ("headpiece_bottomfrontleft","head"),
    ("headpiece_bottomfrontright","head"),
    ("headpiece_bottombackleft","head"),
    ("headpiece_bottombackright","head"),
]

def to_px(u, v, size=IMG_SIZE):
    # u,v in [0,1] -> pixel coords (col,row) with origin top-left
    u = 0.0 if pd.isna(u) else float(u)
    v = 0.0 if pd.isna(v) else float(v)
    x = int(np.clip(u, 0, 1) * (size - 1))
    y = int(np.clip(v, 0, 1) * (size - 1))
    return x, y

labels_rows = []

# Group by frame
for (vid, fr), g in df_joined.groupby(["video_id", "video_frame"], observed=True):
    # Frame label = majority action in this frame
    act_mode = g["action"].mode()
    act = act_mode.iloc[0] if not act_mode.empty else g["action"].iloc[0]

    # Collect bodypart -> (u,v) dicts for A and B
    ptsA, ptsB = {}, {}
    for _, r in g.iterrows():
        bp = r["bodypart"]
        if pd.notna(r["mouse_A_x"]) and pd.notna(r["mouse_A_y"]):
            ptsA[bp] = (float(r["mouse_A_x"]), float(r["mouse_A_y"]))
        if pd.notna(r["mouse_B_x"]) and pd.notna(r["mouse_B_y"]):
            ptsB[bp] = (float(r["mouse_B_x"]), float(r["mouse_B_y"]))

    if not ptsA and not ptsB:
        continue  # nothing to draw

    # Canvas (OpenCV is BGR)
    img = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)

    # Draw edges first (A in RED, B in GREEN)
    for a, b in EDGES:
        if a in ptsA and b in ptsA:
            x1,y1 = to_px(*ptsA[a]); x2,y2 = to_px(*ptsA[b])
            cv2.line(img, (x1,y1), (x2,y2), (0,0,255), 2)
        if a in ptsB and b in ptsB:
            x1,y1 = to_px(*ptsB[a]); x2,y2 = to_px(*ptsB[b])
            cv2.line(img, (x1,y1), (x2,y2), (0,255,0), 2)

    # Draw joints (A in RED, B in GREEN)
    for bp, (u,v) in ptsA.items():
        x,y = to_px(u,v)
        cv2.circle(img, (x,y), 2, (0,0,255), -1)
    for bp, (u,v) in ptsB.items():
        x,y = to_px(u,v)
        cv2.circle(img, (x,y), 2, (0,255,0), -1)

    # Save image
    fn = f"{vid}_{int(fr)}.png"
    fp = os.path.join(OUT_DIR, fn)
    cv2.imwrite(fp, img)

    labels_rows.append([fp, vid, int(fr), act])

# Save labels CSV
labels = pd.DataFrame(labels_rows, columns=["image_path","video_id","video_frame","action"])
labels.to_csv(CSV_PATH, index=False)

print(f"Saved {len(labels)} images -> {OUT_DIR}")
print(f"Labels CSV -> {CSV_PATH}")
labels.head()


In [None]:
# Define the folder to compress and the name of the zip file
source_folder = '/kaggle/working/'
zip_filename = 'kaggle_working_output.zip'

# Create the zip file
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(source_folder):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to the zip archive, relative to the source folder
            zipf.write(file_path, os.path.relpath(file_path, source_folder))

print(f"Compressed all files into: {zip_filename}. Now download this file from the Output panel.")

In [None]:
# --- Setup ---
OUTPUT_DIR = '/kaggle/working/stratified_split'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Assuming your DataFrame is named 'df_merged'
# If df_merged is NOT defined, uncomment the next line to create a dummy:
# data = {'col1': range(100), 'action': ['A'] * 90 + ['B'] * 10}
# df_merged = pd.DataFrame(data)

# --- 1. Define Ratio and Stratify Target ---
# We want 90% in the 'train' file and 10% in the 'test' file.
TEST_SIZE_RATIO = 0.10  # 10% for the smaller file
y_actions = df_merged['action']

print(f"Total rows: {len(df_merged):,}")
print("-" * 40)

# --- 2. Perform Stratified Split ---
# This function splits the DataFrame's indices, using the 'action' column
# to ensure both sets maintain the same proportion of classes.
# We use indices to avoid creating two full copies of the DataFrame in memory.

# Since df_merged.index is the array-like input, the output is indices.
train_indices, test_indices = train_test_split(
    df_merged.index,
    test_size=TEST_SIZE_RATIO,
    shuffle=True,
    random_state=42,
    # Stratify ensures the split respects the class distribution
    stratify=y_actions
)

# --- 3. Save 90% (Train) Data ---
df_train = df_merged.loc[train_indices]
train_file_path = os.path.join(OUTPUT_DIR, 'data_train_90percent.parquet')
df_train.to_parquet(train_file_path, index=False)

# Optional verification
unique_train = df_train['action'].nunique()

print(f"Saved {len(df_train):,} rows to data_train_90percent.parquet (90%)")
print(f"  --> Unique actions in file: {unique_train}")

# --- 4. Save 10% (Test) Data ---
df_test = df_merged.loc[test_indices]
test_file_path = os.path.join(OUTPUT_DIR, 'data_test_10percent.parquet')
df_test.to_parquet(test_file_path, index=False)

# Optional verification
unique_test = df_test['action'].nunique()

print(f"Saved {len(df_test):,} rows to data_test_10percent.parquet (10%)")
print(f"  --> Unique actions in file: {unique_test}")

print("-" * 40)
print(f"Partitioning complete. Files saved in '{OUTPUT_DIR}'.")

Total rows: 36,268,844
----------------------------------------
Saved 32,641,959 rows to data_train_90percent.parquet (90%)
  --> Unique actions in file: 37
Saved 3,626,885 rows to data_test_10percent.parquet (10%)
  --> Unique actions in file: 37
----------------------------------------
Partitioning complete. Files saved in '/kaggle/working/stratified_split'.


In [None]:
del df_merged
del y_actions
del df_test
del train_df
del valid_df
del df_tracking
del df_annotation
#del _6
gc.collect()

0

In [None]:
gc.collect()

0

In [None]:
# Define your data directory (assuming you placed the 'parquet_chunks' folder
# in the same working directory or a linked dataset)
OUTPUT_DIR = '/kaggle/working/stratified_split'

# --- Load and Prepare Data ---

# 1. Load All Parquet Files
try:
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "data_train_90percent.parquet"))
    list_of_dfs = [pd.read_parquet(f) for f in all_files]
    df_half = pd.concat(list_of_dfs, ignore_index=True)
    print(f"Successfully loaded {len(all_files)} files.")
    print(f"Total rows: {len(df_half):,}")
except Exception as e:
    print(f"Error loading files. Check directory path: {e}")
    # Create a dummy DataFrame if loading fails to prevent kernel crash
    # df_final = pd.DataFrame()

Successfully loaded 1 files.
Total rows: 32,641,959


In [None]:
#nil_count = (df_half['action'] != 'NIL').sum()
#print(f"The number of 'NIL' actions is: {nil_count}")

In [None]:
# --- 1. Define the Index Columns ---
# These columns will define the unique rows in the resulting DataFrame.
ID_COLS = ['lab_id', 'video_id', 'video_frame', 'mouse_id', 'target_id']

# --- 2. Separate Action for Merging ---
# Since 'Action' is constant for a given combination of ID_COLS,
# we extract it separately to avoid issues with the pivot,
# then merge it back later. This is often necessary when the value
# column (like 'x'/'y') is not strictly unique.


# Keep only the unique combinations of ID_COLS and action
action_df = df_half[ID_COLS + ['action']].drop_duplicates()

# --- 3. Perform the Pivot Operation with Aggregation ---
# Use pivot_table to handle duplicate index/column entries by taking the mean.
df_pivoted = df_half.pivot_table(
    index=ID_COLS,           # The columns that form the new unique row identifier
    columns='bodypart',      # The column whose unique values become the new column headers
    values=['x', 'y'],       # The columns whose values will be aggregated
    aggfunc='mean'           # CRITICAL: Calculates the mean of the 'x' and 'y' duplicates
)

# --- 4. Clean Up Column Names ---
# The pivot operation creates multi-level columns (e.g., ('x', 'headpiece_bottombackright')).
# We flatten and rename them for easier use: 'bodypart_x', 'bodypart_y'.
df_pivoted.columns = [f'{col[1]}_{col[0]}' for col in df_pivoted.columns.values]

# --- 5. Reset Index and Merge Action ---
# Reset the index to turn the ID_COLS back into regular columns
df_pivoted = df_pivoted.reset_index()

# Merge the action column back into the pivoted DataFrame
df_final = pd.merge(
    df_pivoted,
    action_df,
    on=ID_COLS,
    how='left'  # Use a left merge to keep all the pivoted data
)

# --- 6. Reorder Columns for Clarity ---
# Move 'action' to be near the ID columns
final_columns = ID_COLS + ['action'] + [col for col in df_final.columns if col not in ID_COLS + ['action']]
df_final = df_final[final_columns]


In [None]:
df_final.drop(columns=ID_COLS, axis=1, inplace=True)
print("--- Final Reshaped DataFrame Head ---")
print(df_final.head())
print(f"\nFinal DataFrame Shape: {df_final.shape}")

--- Final Reshaped DataFrame Head ---
  action  body_center_x  ear_left_x  ear_right_x  head_x  \
0  chase     495.295013  493.692993   517.568970     NaN   
1  chase     494.980011  493.643005   517.301025     NaN   
2  chase     495.061005  493.787994   519.041992     NaN   
3  chase     495.018005  494.544006   519.635010     NaN   
4  chase     495.566010  496.365997   520.189026     NaN   

   headpiece_bottombackleft_x  headpiece_bottombackright_x  \
0                         NaN                          NaN   
1                         NaN                          NaN   
2                         NaN                          NaN   
3                         NaN                          NaN   
4                         NaN                          NaN   

   headpiece_bottomfrontleft_x  headpiece_bottomfrontright_x  \
0                          NaN                           NaN   
1                          NaN                           NaN   
2                          NaN      

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [None]:
#count_total = sum(1 for col in df_final.columns if col.endswith('_x') or col.endswith('_y'))
#print(f"Total columns ending with '_x' or '_y': {count_total}")

In [None]:
# --- Setup ---
# 1. Define the directory where files will be saved
OUTPUT_DIR = '/kaggle/working/train'
# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. Define the maximum rows per file (1.8 million)
CHUNK_SIZE = 1_800_0000

# 1. Calculate the number of chunks needed
num_rows = len(df_final)
num_chunks = int(np.ceil(num_rows / CHUNK_SIZE))
print(f"Total rows: {num_rows:,}")
print(f"Saving to {num_chunks} files with a max size of {CHUNK_SIZE:,} rows.")

# 2. Split the DataFrame indices into chunks
# This is memory efficient as it only creates small lists/arrays of index labels,
# NOT copies of the DataFrame data itself.
indices_chunks = np.array_split(df_final.index, num_chunks)

# 3. Loop through the chunks and save to Parquet files
for i, chunk_indices in enumerate(indices_chunks):

    # Select the slice of the DataFrame using .loc[]
    # This loads only the current chunk into memory
    df_chunk = df_final.loc[chunk_indices]

    # Define the output file path
    file_name = f'chunk_{i+1:03d}.parquet'
    file_path = os.path.join(OUTPUT_DIR, file_name)

    # Save the chunk to a Parquet file
    df_chunk.to_parquet(file_path, index=False)

    print(f"Saved {len(df_chunk):,} rows to {file_name}")

print(f"Partitioning complete. Files saved in the '{OUTPUT_DIR}' directory.")

Total rows: 5,353,673
Saving to 1 files with a max size of 18,000,000 rows.
Saved 5,353,673 rows to chunk_001.parquet
Partitioning complete. Files saved in the '/kaggle/working/train' directory.


In [None]:
del action_df
del df_final
del df_half
del df_pivoted
del df_chunk
gc.collect()


0

In [None]:
# Define your data directory (assuming you placed the 'parquet_chunks' folder
# in the same working directory or a linked dataset)
OUTPUT_DIR = '/kaggle/working/train'

# --- Load and Prepare Data ---

# 1. Load All Parquet Files
try:
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "*.parquet"))
    list_of_dfs = [pd.read_parquet(f) for f in all_files]
    df_final = pd.concat(list_of_dfs, ignore_index=True)
    print(f"Successfully loaded {len(all_files)} files.")
    print(f"Total rows: {len(df_final):,}")
except Exception as e:
    print(f"Error loading files. Check directory path: {e}")
    # Create a dummy DataFrame if loading fails to prevent kernel crash
    # df_final = pd.DataFrame()

print(f"Total NaN values remaining: {df_final.isna().sum().sum()}")

Successfully loaded 1 files.
Total rows: 5,353,673
Total NaN values remaining: 202399732


In [None]:
# Assuming 'df_final' is your DataFrame after concatenating all Parquet files

# 1. Forward Fill (ffill)
# This uses the LAST VALID observation to fill the missing values.
# This is ideal for time-series data, assuming the mouse stayed in the last known pose.
print("Starting Forward Fill (ffill)...")
df_final.fillna(method='ffill', inplace=True)
print("Forward Fill complete.")

# 2. Backward Fill (bfill) - CRITICAL STEP
# After ffill, any NaNs remaining are those at the very beginning of the dataset
# (where there is no preceding observation). bfill uses the NEXT valid observation.
print("Starting Backward Fill (bfill)...")
df_final.fillna(method='bfill', inplace=True)
print("Backward Fill complete. All NaN values should now be resolved.")

# You can add a quick check to verify:
print(f"Total NaN values remaining: {df_final.isna().sum().sum()}")

Starting Forward Fill (ffill)...


  df_final.fillna(method='ffill', inplace=True)


Forward Fill complete.
Starting Backward Fill (bfill)...


  df_final.fillna(method='bfill', inplace=True)


Backward Fill complete. All NaN values should now be resolved.
Total NaN values remaining: 0


In [None]:
print(df_final.head())

  action  body_center_x  ear_left_x  ear_right_x      head_x  \
0  chase     495.295013  493.692993   517.568970  248.142365   
1  chase     494.980011  493.643005   517.301025  248.142365   
2  chase     495.061005  493.787994   519.041992  248.142365   
3  chase     495.018005  494.544006   519.635010  248.142365   
4  chase     495.566010  496.365997   520.189026  248.142365   

   headpiece_bottombackleft_x  headpiece_bottombackright_x  \
0                 1122.628052                  1119.015015   
1                 1122.628052                  1119.015015   
2                 1122.628052                  1119.015015   
3                 1122.628052                  1119.015015   
4                 1122.628052                  1119.015015   

   headpiece_bottomfrontleft_x  headpiece_bottomfrontright_x  \
0                  1132.064941                   1073.786987   
1                  1132.064941                   1073.786987   
2                  1132.064941                   1

In [None]:
# 2. Separate Features (X) and Target (y)
feature_cols = [col for col in df_final.columns if col.endswith('_x') or col.endswith('_y')]
X_all = df_final[feature_cols].values # Features (coordinates)
y_all = df_final['action'].values     # Target (action label)


In [None]:
# --- 1. Calculate Raw Class Frequencies ---
# Convert the NumPy array of training labels (y_all or y_train) to a Series
# and use value_counts() to get the frequency of each action string.
# NOTE: Replace 'y_all' with the variable containing your training action labels.
action_counts = pd.Series(y_all).value_counts().sort_index()

# --- 2. Align Counts with LabelEncoder Order ---
# CRITICAL: Reindex the calculated counts using the order of le.classes_
# This ensures Class 0 (e.g., 'NIL') corresponds to Index 0 in the weights tensor, and so on.
# Use fill_value=0 in case any class is missing from the subset (e.g., if using y_train)
# 3. Encode Action Labels (NIL, NON-NIL, etc. to 0, 1, ...)
le = LabelEncoder()
y_encoded = le.fit_transform(y_all)
counts_aligned = action_counts.reindex(le.classes_, fill_value=0)

# --- 3. Calculate Inverse Frequency Weights ---
total_samples = counts_aligned.sum()
num_classes = len(counts_aligned)

# Calculate weights: Use the inverse frequency method
# Higher weight for less frequent classes
class_weights = total_samples / (num_classes * counts_aligned.values)

In [None]:
del df_final

In [None]:
NUM_CLASSES = len(le.classes_)
print(f"Action classes encoded to {NUM_CLASSES} classes.")

Action classes encoded to 37 classes.


In [None]:
# 4. Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [None]:
# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

In [None]:
del y_all
del X_all
del y_encoded
del X_train
del X_val
del y_train
del y_val
gc.collect()

0

In [None]:
# --- Configuration ---
SEQUENCE_LENGTH = 30  # How many previous frames to include in one sequence sample
BATCH_SIZE = 1024     # Size of data batches for training
INPUT_SIZE = X_train_tensor.shape[1] # Number of features (e.g., 50 if 25 bodyparts * 2 coords)
print(INPUT_SIZE)
# --- Custom Dataset Class for Sequence Data ---
class PoseSequenceDataset(Dataset):
    def __init__(self, features, labels, sequence_length):
        self.features = features
        self.labels = labels
        self.sequence_length = sequence_length
        self.indices = self._create_indices()

    def _create_indices(self):
        # Create indices for the start of each sequence.
        # The last possible start index is len(features) - sequence_length
        return np.arange(len(self.features) - self.sequence_length)

    def __len__(self):
        # The number of available sequences
        return len(self.indices)

    def __getitem__(self, idx):
        start_idx = self.indices[idx]
        end_idx = start_idx + self.sequence_length

        # X: Sequence of features (e.g., 30 frames x 50 coordinates)
        x_sequence = self.features[start_idx:end_idx]

        # Y: The label for the *last frame* in the sequence
        y_label = self.labels[end_idx - 1]

        return x_sequence, y_label

# Create Dataset and DataLoader instances
train_dataset = PoseSequenceDataset(X_train_tensor, y_train_tensor, SEQUENCE_LENGTH)
val_dataset = PoseSequenceDataset(X_val_tensor, y_val_tensor, SEQUENCE_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

50


In [None]:
# --- Model Definition ---
class MouseActionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(MouseActionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # 1. LSTM Layer: Processes the time sequence
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        # 2. Fully Connected Layer: Maps the final hidden state to the class prediction
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Initialize hidden state and cell state (optional, but good practice)
        # h0 and c0 will be created automatically if not passed, but explicit is cleaner

        # Pass the sequence through the LSTM
        # out has shape (batch_size, sequence_length, hidden_size)
        out, _ = self.lstm(x)

        # We only care about the output from the LAST frame in the sequence
        # out[:, -1, :] extracts the last time step output
        out = self.fc(out[:, -1, :])
        return out

# Instantiate the model
HIDDEN_SIZE = 128
NUM_LAYERS = 2
model = MouseActionLSTM(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES)

print("LSTM model defined successfully.")

LSTM model defined successfully.


In [None]:
# --- Training Configuration ---
LEARNING_RATE = 0.00005
NUM_EPOCHS = 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assuming class_weights_tensor is defined and moved to device
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)
print(class_weights_tensor[:5])

# Move model to device (GPU if available)
model.to(DEVICE)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Training Loop ---
for epoch in range(NUM_EPOCHS):
    start_time = time.perf_counter()
    model.train()
    total_loss = 0

    for i, (sequences, labels) in enumerate(train_loader):
        # ... (Training steps: Forward, Loss, Backward, Step) ...
        sequences = sequences.to(DEVICE)
        labels = labels.to(DEVICE)

        outputs = model(sequences)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # --- Validation ---
    model.eval()
    with torch.no_grad():

        # Lists to store all labels and predictions for per-class analysis
        val_true_labels = []
        val_predicted_indices = []

        for sequences, labels in val_loader:
            sequences = sequences.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(sequences)
            _, predicted = torch.max(outputs.data, 1)

            # Accumulate data
            val_true_labels.extend(labels.cpu().numpy())
            val_predicted_indices.extend(predicted.cpu().numpy())

        # Convert accumulated lists to NumPy arrays
        val_true_labels = np.array(val_true_labels)
        val_predicted_indices = np.array(val_predicted_indices)

        # Calculate Overall Accuracy
        total_samples = len(val_true_labels)
        overall_correct = np.sum(val_predicted_indices == val_true_labels)
        overall_accuracy = 100 * overall_correct / total_samples

        # --- Calculate Per-Class Accuracy ---
        class_names = le.classes_ # Get class names from the fitted LabelEncoder

        print("\n--- Individual Class Validation Accuracy ---")

        # Loop through each class index (0 to NUM_CLASSES - 1)
        for i in range(NUM_CLASSES):
            # 1. Get indices where the true label matches the current class 'i'
            class_indices = (val_true_labels == i)

            # 2. Get the total number of samples for this class
            class_total = np.sum(class_indices)

            if class_total == 0:
                continue

            # 3. Get the predictions ONLY for this class's samples
            class_correct = np.sum(val_predicted_indices[class_indices] == i)

            # 4. Calculate accuracy
            class_accuracy = 100 * class_correct / class_total

            # Print results
            print(f"  {class_names[i]:<20}: {class_accuracy:.2f}% ({class_correct}/{class_total})")

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    print(f'\nEpoch [{epoch+1}/{NUM_EPOCHS}], Loss: {avg_loss:.4f}, Overall Validation Accuracy: {overall_accuracy:.2f}%')
    print(f"The operation took {elapsed_time:.4f} seconds to complete.")

print("Training complete!")

tensor([19.3700,  1.6311,  0.2793, 26.1937,  6.4621], device='cuda:0')

--- Individual Class Validation Accuracy ---
  allogroom           : 41.37% (618/1494)
  approach            : 0.00% (0/17742)
  attack              : 8.04% (8327/103629)
  attemptmount        : 0.09% (1/1105)
  avoid               : 38.23% (1712/4478)
  biteobject          : 21.40% (101/472)
  chase               : 10.03% (480/4785)
  chaseattack         : 48.24% (451/935)
  climb               : 49.63% (5911/11911)
  defend              : 6.22% (1133/18218)
  dig                 : 0.09% (14/15948)
  disengage           : 95.24% (2343/2460)
  dominance           : 99.77% (7679/7697)
  dominancegroom      : 71.80% (685/954)
  dominancemount      : 43.05% (1537/3570)
  ejaculate           : 73.83% (189/256)
  escape              : 0.20% (37/18398)
  exploreobject       : 10.04% (76/757)
  flinch              : 10.22% (38/372)
  follow              : 82.04% (6588/8030)
  freeze              : 90.87% (5254/5782)
  gen

In [None]:
# Iterate through the classes_ array. The index 'i' is the encoded value.
for i, class_label in enumerate(le.classes_):
    print(f"     {i:<12} | {class_label}")

In [None]:
MODEL_DIR = '/kaggle/working/model'
# Create the output directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_FILE = os.path.join(MODEL_DIR, 'mouse_action_lstm.pth')

# Ensure your model is on the CPU before saving to avoid GPU compatibility issues when loading
model.to('cpu')

# Save only the model's state dictionary
torch.save(model.state_dict(), MODEL_FILE)

print(f"Model successfully saved to: {MODEL_FILE}")

In [None]:
gc.collect()

In [None]:
# Define your data directory (assuming you placed the 'parquet_chunks' folder
# in the same working directory or a linked dataset)
OUTPUT_DIR = '/kaggle/working/stratified_split'

# --- Load and Prepare Data ---

# 1. Load All Parquet Files
try:
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "data_test_10percent.parquet"))
    list_of_dfs = [pd.read_parquet(f) for f in all_files]
    df_test = pd.concat(list_of_dfs, ignore_index=True)
    print(f"Successfully loaded {len(all_files)} files.")
    print(f"Total rows: {len(df_test):,}")
except Exception as e:
    print(f"Error loading files. Check directory path: {e}")
    # Create a dummy DataFrame if loading fails to prevent kernel crash
    # df_final = pd.DataFrame()

In [None]:
# 1. Count the true labels in the original test labels
print("True label counts in the data used for inference:")
# Use the un-encoded labels before they went into the DataLoader
print(df_test['action'].value_counts())

In [None]:
# --- 1. Define the Index Columns ---
# These columns will define the unique rows in the resulting DataFrame.
ID_COLS = ['lab_id', 'video_id', 'video_frame', 'mouse_id', 'target_id']

# --- 2. Separate Action for Merging ---
# Since 'Action' is constant for a given combination of ID_COLS,
# we extract it separately to avoid issues with the pivot,
# then merge it back later. This is often necessary when the value
# column (like 'x'/'y') is not strictly unique.

# Keep only the unique combinations of ID_COLS and action
action_df = df_test[ID_COLS + ['action']].drop_duplicates()

# --- 3. Perform the Pivot Operation ---
# This transforms the 'bodypart' rows into columns.
df_pivoted = df_test.pivot_table(
    index=ID_COLS,           # The columns that form the new unique row identifier
    columns='bodypart',      # The column whose unique values become the new column headers
    values=['x', 'y'],       # The columns whose values will be aggregated
    aggfunc='mean'           # CRITICAL: Calculates the mean of the 'x' and 'y' duplicates
)

# --- 4. Clean Up Column Names ---
# The pivot operation creates multi-level columns (e.g., ('x', 'headpiece_bottombackright')).
# We flatten and rename them for easier use: 'bodypart_x', 'bodypart_y'.
df_pivoted.columns = [f'{col[1]}_{col[0]}' for col in df_pivoted.columns.values]

# --- 5. Reset Index and Merge Action ---
# Reset the index to turn the ID_COLS back into regular columns
df_pivoted = df_pivoted.reset_index()

# Merge the action column back into the pivoted DataFrame
df_final = pd.merge(
    df_pivoted,
    action_df,
    on=ID_COLS,
    how='left'  # Use a left merge to keep all the pivoted data
)

# --- 6. Reorder Columns for Clarity ---
# Move 'action' to be near the ID columns
final_columns = ID_COLS + ['action'] + [col for col in df_final.columns if col not in ID_COLS + ['action']]
df_final = df_final[final_columns]


In [None]:
df_final.drop(columns=ID_COLS, axis=1, inplace=True)
print("--- Final Reshaped DataFrame Head ---")
print(df_final.head())
print(f"\nFinal DataFrame Shape: {df_final.shape}")

In [None]:
INPUT_SIZE = sum(1 for col in df_final.columns if col.endswith('_x') or col.endswith('_y'))
print(INPUT_SIZE)

NUM_CLASSES = len(df_final['action'].unique())
print(NUM_CLASSES)

In [None]:
# --- Model Definition ---
class MouseActionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(MouseActionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # 1. LSTM Layer: Processes the time sequence
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        # 2. Fully Connected Layer: Maps the final hidden state to the class prediction
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Initialize hidden state and cell state (optional, but good practice)
        # h0 and c0 will be created automatically if not passed, but explicit is cleaner

        # Pass the sequence through the LSTM
        # out has shape (batch_size, sequence_length, hidden_size)
        out, _ = self.lstm(x)

        # We only care about the output from the LAST frame in the sequence
        # out[:, -1, :] extracts the last time step output
        out = self.fc(out[:, -1, :])
        return out

# Instantiate the model
HIDDEN_SIZE = 128
NUM_LAYERS = 2
MODEL_FILE = '/kaggle/working/model/mouse_action_lstm.pth'
loaded_model  = MouseActionLSTM(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES)
loaded_model.load_state_dict(torch.load(MODEL_FILE))
print("LSTM model defined successfully.")

In [None]:
class PoseSequenceDataset(Dataset):
    def __init__(self, features, labels, sequence_length):
        self.features = features
        self.labels = labels
        self.sequence_length = sequence_length
        self.indices = self._create_indices()

    def _create_indices(self):
        # Create indices for the start of each sequence.
        # The last possible start index is len(features) - sequence_length
        return np.arange(len(self.features) - self.sequence_length)

    def __len__(self):
        # The number of available sequences
        return len(self.indices)

    def __getitem__(self, idx):
        start_idx = self.indices[idx]
        end_idx = start_idx + self.sequence_length

        # X: Sequence of features (e.g., 30 frames x 50 coordinates)
        x_sequence = self.features[start_idx:end_idx]

        # Y: The label for the *last frame* in the sequence
        y_label = self.labels[end_idx - 1]

        return x_sequence, y_label

# --- Configuration (MUST match training) ---
SEQUENCE_LENGTH = 30  # Same as used for training
# Assuming you have the trained LabelEncoder 'le' from the training script
# If not, you must save and load it, or recreate it with ALL known classes.
# For simplicity, we assume le is available.

# --- 1. Impute NaN Values ---
print("Applying ffill and bfill to df_test...")
# Select only the feature columns for imputation
feature_cols = [col for col in df_final.columns if col.endswith('_x') or col.endswith('_y')]

# Apply FFill and BFill in sequence
df_final[feature_cols].fillna(method='ffill', inplace=True)
df_final[feature_cols].fillna(method='bfill', inplace=True)

# --- 2. Separate Features (X_test) and Target (y_test) ---
X_test_np = df_final[feature_cols].values
y_test_labels = df_final['action'].values # Keep original labels for comparison
y_test_encoded = le.transform(y_test_labels)
# Convert to PyTorch Tensor
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)

# --- 3. Create Sequence Dataset ---
# We reuse the PoseSequenceDataset class defined during training.
# Since we need to match the structure, we use a simple DataLoader.

# IMPORTANT: Skip encoding the test labels to suppress the ValueError
# We use a placeholder tensor for the labels, which will be the correct size.
# NOTE: This means you CANNOT use 'y_test_tensor' to calculate accuracy with le.transform()
# You must handle the evaluation comparison manually later.

# ORIGINAL LINE (Caused Error): y_test_encoded = le.transform(y_test_labels)
# ORIGINAL LINE (Caused Error): y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

test_dataset = PoseSequenceDataset(X_test_tensor, y_test_tensor, SEQUENCE_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

print(f"Test data ready. Total sequences: {len(test_dataset):,}")

In [None]:
del df_final
del df_test
del df_pivoted
del X_test_np
del y_test_labels
del y_test_tensor
del test_dataset

In [None]:
# --- Inference Execution ---

# Ensure the model is in evaluation mode and on the correct device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model.to(DEVICE)
loaded_model.eval()

all_predictions = []
all_true_labels = []

print("Starting inference...")

with torch.no_grad(): # Essential: disables gradient calculation to save memory and speed
    for sequences, labels in test_loader:

        sequences = sequences.to(DEVICE)

        # 1. Forward Pass
        outputs = loaded_model(sequences)

        # 2. Get Predicted Class Index
        # torch.max returns (max_value, max_index). We want the index (the class ID).
        _, predicted_indices = torch.max(outputs.data, 1)

        # Store predictions and true labels
        all_predictions.extend(predicted_indices.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy()) # Store true encoded labels

print("Inference complete.")



In [None]:
# --- 3. Decode and Evaluate ---
# Decode the predicted indices back into their original string labels
print(le)
predicted_actions = le.inverse_transform(all_predictions)

# Create a final DataFrame for review
results_df = pd.DataFrame({
    'True_Action': le.inverse_transform(all_true_labels),
    'Predicted_Action': predicted_actions
})

# Calculate Final Accuracy
final_accuracy = accuracy_score(all_true_labels, all_predictions)

print("\n--- Inference Results ---")
print(f"Overall Test Accuracy: {final_accuracy:.4f}")
print("\nSample Predictions:")
print(results_df)

In [None]:
# Filter the DataFrame where the 'True_Action' column is not 'NIL'
non_nil_actions = results_df[results_df['Predicted_Action'] != 'NIL']

# Display the resulting DataFrame
print("--- Rows where True_Action is NOT 'NIL' ---")
print(len(non_nil_actions))