In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from tqdm import tqdm
import os
import random
import ipywidgets as widgets
import warnings
import json
import gc
import cv2
from collections import defaultdict

In [87]:
!rm -rf /kaggle/working/*

In [88]:
class CFG:
    train_path = "/kaggle/input/MABe-mouse-behavior-detection/train.csv"
    test_path = "/kaggle/input/MABe-mouse-behavior-detection/test.csv"
    sample_submission_path = "/kaggle/input/MABe-mouse-behavior-detection/sample_submission.csv"
    train_annotation_path = "/kaggle/input/MABe-mouse-behavior-detection/train_annotation"
    # train_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/train_tracking"
    # test_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/test_tracking"
    second_threshold = 0.1
    output_path = f"/kaggle/working/filtered_{second_threshold}_train_annotation"
    
    num_frames = 1000

In [89]:
train_df = pd.read_csv(CFG.train_path)

In [90]:
"""
Remove MABe labs (missing)
Count mice in per frame
"""
print(f"Original: {train_df.shape}")

# Remove 'MABe22_keypoints', 'MABe22_movies' from train data
remove_list = ["MABe22_keypoints", "MABe22_movies"]
train_without_mabe_df = train_df[~train_df["lab_id"].isin(remove_list)].copy()

print(f"After removing MABe labs: {train_without_mabe_df.shape}")
train_without_mabe_df['frames_per_second'].head()

Original: (8789, 38)
After removing MABe labs: (863, 38)


0    30.0
1    25.0
2    30.0
3    30.0
4    30.0
Name: frames_per_second, dtype: float64

In [91]:
sample = pd.read_parquet("/kaggle/input/MABe-mouse-behavior-detection/train_annotation/AdaptableSnail/1212811043.parquet")
sample

Unnamed: 0,agent_id,target_id,action,start_frame,stop_frame
0,1,3,chase,2,54
1,1,3,chase,128,234
2,3,2,avoid,324,342
3,3,1,avoid,324,342
4,1,2,chase,942,1052
...,...,...,...,...,...
365,1,2,chase,86668,86689
366,1,4,avoid,86815,86845
367,1,3,attack,89171,89194
368,4,2,avoid,89272,89299


# Analysis short-action labels

In [92]:
frames_per_second = train_without_mabe_df['frames_per_second'].iloc[0]
threshold_frames = int(frames_per_second * CFG.second_threshold)

print("Examle")
print(f"frames per second: {frames_per_second}")
print(f"consider less than {short_action_frames} frames are noise (too short)")

Examle
frames per second: 30.0
consider less than 3 frames are noise (too short)


In [93]:
from pathlib import Path
import pandas as pd

all_actions_df = pd.DataFrame()

for _, row in train_without_mabe_df.iterrows():
    lab_id = row["lab_id"]
    video_id = row["video_id"]

    frames_per_second = row["frames_per_second"]

    try:
        file_annot_path = (
            Path(CFG.train_annotation_path)
            / lab_id
            / f"{video_id}.parquet"
        )
        annotation_df = pd.read_parquet(file_annot_path)

        annotation_df = annotation_df.copy()
        annotation_df["lab_id"] = lab_id
        annotation_df["video_id"] = video_id
        annotation_df["fps"] = frames_per_second
        annotation_df["duration_frames"] = (
            annotation_df["stop_frame"] - annotation_df["start_frame"]
        )
        annotation_df["duration_sec"] = annotation_df["duration_frames"] / frames_per_second

        all_actions_df = pd.concat([all_actions_df, annotation_df], ignore_index=True)

    except Exception:
        pass

In [94]:
mean_frames = all_actions_df["duration_frames"].mean()
mean_seconds = all_actions_df["duration_sec"].mean()
med_frames = all_actions_df["duration_frames"].median()
med_seconds = all_actions_df["duration_sec"].median()
max_frames = all_actions_df["duration_frames"].max()
max_seconds = all_actions_df["duration_sec"].max()

print("avg frames for one action:", mean_frames)
print("avg second for one action:", mean_seconds)
print()
print("med frames for one action:", med_frames)
print("med second for one action:", med_seconds)
print()
print("max frames for one action:", max_frames)
print("max second for one action:", max_seconds)

avg frames for one action: 63.708981104998806
avg second for one action: 2.1260261764203903

med frames for one action: 28.0
med second for one action: 0.9666666666666667

max frames for one action: 13906
max second for one action: 463.53333333333336


In [95]:
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]

result = []

for th in thresholds:
    cond = all_actions_df["duration_sec"] <= th
    ratio = cond.mean() 
    result.append({"threshold_sec": th, "ratio": ratio})

ratio_df = pd.DataFrame(result)
ratio_df

Unnamed: 0,threshold_sec,ratio
0,0.1,0.014614
1,0.2,0.070868
2,0.3,0.136558
3,0.4,0.206769
4,0.5,0.272614


# Remove short-action labels

In [96]:
def remove_short_action(annot_df, threshold_frames):
    
    annot_df = annot_df.copy()
    annot_df['is_short'] = (annot_df['stop_frame'] - annot_df['start_frame']) <= threshold_frames
    
    return annot_df[annot_df['is_short']!=True]

In [99]:
from pathlib import Path
import gc
import joblib

OUTPUT_ROOT = Path(CFG.output_path)
OUTPUT_ROOT.mkdir(exist_ok=True, parents=True)

for _, row in train_without_mabe_df.iterrows():
    lab_id = row["lab_id"]
    video_id = row["video_id"]

    frames_per_second = row['frames_per_second']
    threshold_frames = frames_per_second * CFG.second_threshold

    try:
        file_annot_path = (
            Path(CFG.train_annotation_path)
            / lab_id
            / f"{video_id}.parquet"
        )
        annotation_df = pd.read_parquet(file_annot_path)
        cleaned_annotation_df = remove_short_action(annotation_df, threshold_frames)
        
        # save
        save_dir = OUTPUT_ROOT / lab_id
        save_dir.mkdir(exist_ok=True, parents=True)
        save_path = save_dir / f"{video_id}.parquet"

        cleaned_annotation_df.to_parquet(save_path, index=False)
        # print(f"Saved {save_path}")
    
    except Exception as e:
        pass
        # print(f"Error in {lab_id}/{video_id} -> {e}")

In [102]:
original = pd.read_parquet("/kaggle/input/MABe-mouse-behavior-detection/train_annotation/AdaptableSnail/1212811043.parquet")
filtered = pd.read_parquet("/kaggle/working/filtered_0.1_train_annotation/AdaptableSnail/1212811043.parquet")

print(original.shape)
print(filtered.shape)

(370, 5)
(368, 6)


In [108]:
# Removed rows
removed = original.merge(
    filtered,
    on=["start_frame", "stop_frame", "action"],
    how="left",
    indicator=True
).query('_merge == "left_only"').drop(columns=["_merge"])

removed.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,agent_id_x,target_id_x,action,start_frame,stop_frame,agent_id_y,target_id_y,is_short
84,1,2,chaseattack,8068,8070,,,
228,1,4,chase,24332,24334,,,
