## this notebook is to extract all the investigation events
1. Step 1: extract all investigation instances
2. Step 2:extract the informationa and features during the investigation
   - **sequence:** its video sequence source;
   - **investigation_ID:** the index of the investigation within a video sequence, starting from 0;
   - **pair_label:** index of the investigation_index of the paired attack, so 1_0 should be, the first investigation of the video sequence, no paired attack; 2_1 means the second investigation of the video sequence, paired with the first attack of the video sequence;
   - **paried:** 0 is no paired attack follows; 1 means has attack followed. Note that pair means the cloest attack bout happening after the investigation, same for investigation;
   - **start_frame:** the start_frame of the investigation bout in its corresponding video sequence, frame start from0;
   - **end_frame:** the end_frame of the investigation bout in its corresponding video sequence, frame start from 0;
   - **duration:** = end_frame - start_frame;
   - **mean speed:** linear speed of the center of mass(mean x,y position of all keypoints)
   - **mean acc:** linear acc of the center of mass;
   - **mean_rotation:** the mean angle of nose-center of mass-tail base;
   - **mean_rotation_speed**: the mean angle change of nose-center of mass-tail base;
3.  Step: clean and save the data

In [1]:
#import some packages
import numpy as np
import pandas as pd
import os

In [3]:
#load the data
#group frames by video name, and do the feature extraction
df = pd.read_csv(r"G:\My Drive\100-PhD Study\110-Courses\2025Summer_Neuromatch\processed_data\calms21_task_data.csv") #should name by your own path

In [4]:
print("Number of unique sequences:", df['sequence'].unique())


Number of unique sequences: ['task1/train/mouse001_task1_annotator1'
 'task1/train/mouse002_task1_annotator1'
 'task1/train/mouse003_task1_annotator1'
 'task1/train/mouse004_task1_annotator1'
 'task1/train/mouse005_task1_annotator1'
 'task1/train/mouse006_task1_annotator1'
 'task1/train/mouse007_task1_annotator1'
 'task1/train/mouse008_task1_annotator1'
 'task1/train/mouse009_task1_annotator1'
 'task1/train/mouse010_task1_annotator1'
 'task1/train/mouse011_task1_annotator1'
 'task1/train/mouse012_task1_annotator1'
 'task1/train/mouse013_task1_annotator1'
 'task1/train/mouse014_task1_annotator1'
 'task1/train/mouse015_task1_annotator1'
 'task1/train/mouse016_task1_annotator1'
 'task1/train/mouse017_task1_annotator1'
 'task1/train/mouse018_task1_annotator1'
 'task1/train/mouse019_task1_annotator1'
 'task1/train/mouse020_task1_annotator1'
 'task1/train/mouse021_task1_annotator1'
 'task1/train/mouse022_task1_annotator1'
 'task1/train/mouse023_task1_annotator1'
 'task1/train/mouse024_task1_

In [37]:
import numpy as np
import pandas as pd

def find_I_A_all(sequence_df):
    """
    this if function is to extract all the investigations and attacks of all video sequences
    input: sequence_df, dataframe of the video data, each row is a single frame, each column is its associated information
    """
    bouts = []
    current_behavior = None
    current_start = None
    investigation_ID = int(0)
    attack_ID = int(0)

    for i, row in sequence_df.iterrows():
        label = row['label']

        if current_behavior is None:
            current_behavior = label
            current_start = row['frame']
            continue

        if label != current_behavior:
            end_frame = row['frame'] - 1
            bout = {
                'sequence': row['sequence'],
                'label': current_behavior,
                'start_frame': current_start,
                'end_frame': end_frame,
                'duration': end_frame - current_start + 1,
            }

            if current_behavior == 1:
                bout['investigation_ID'] = investigation_ID
                bout['attack_ID'] = None
                investigation_ID += 1
            elif current_behavior == 0:
                bout['attack_ID'] = attack_ID
                bout['investigation_ID'] = None
                attack_ID += 1

            bouts.append(bout)
            current_behavior = label
            current_start = row['frame']

    # Add the final bout
    if current_behavior is not None and current_start is not None:
        end_frame = sequence_df.iloc[-1]['frame']
        bout = {
            'sequence': row['sequence'],
            'label': current_behavior,
            'start_frame': current_start,
            'end_frame': end_frame,
            'duration': end_frame - current_start + 1,
        }

        if current_behavior == 1:
            bout['investigation_ID'] = investigation_ID
            bout['attack_ID'] = None
        elif current_behavior == 0:
            bout['attack_ID'] = attack_ID
            bout['investigation_ID'] = None

        bouts.append(bout)

    return pd.DataFrame(bouts)




def find_I_A_pair_and_features(sequence_df, bout_df, dt=1/30):
    """
    For each investigation bout, find if it's followed by an attack (one-to-one mapping).
    Computes movement features during each investigation bout.
    Returns a DataFrame of investigation bouts with pairing and features.
    """
    result = []

    investigation_bouts = bout_df[bout_df['label'] == 1].copy().sort_values('start_frame').reset_index(drop=True)
    attack_bouts = bout_df[bout_df['label'] == 0].copy().sort_values('start_frame').reset_index(drop=True)

    used_investigations = set()

    # Map attacks to their most recent investigation
    attack_to_investigation = {}
    for _, atk in attack_bouts.iterrows():
        valid_inv = investigation_bouts[investigation_bouts['end_frame'] < atk['start_frame']]
        if not valid_inv.empty:
            last_inv = valid_inv.iloc[-1]
            inv_id = last_inv['investigation_ID']
            if inv_id not in used_investigations:
                attack_to_investigation[inv_id] = {
                    'attack_start': atk['start_frame'],
                    'attack_end': atk['end_frame'],
                    'attack_ID': atk.get('attack_ID', 0)
                }
                used_investigations.add(inv_id)

    for _, inv in investigation_bouts.iterrows():
        i_start, i_end = inv['start_frame'], inv['end_frame']
        inv_id = inv['investigation_ID']
        attack_info = attack_to_investigation.get(inv_id, None)

        if attack_info:
            pair_label = f"{int(inv_id)}_{int(attack_info['attack_ID'])}"
            paired = 1
        else:
            pair_label = f"{int(inv_id)}_0"
            paired = 0

        # Extract feature data from original sequence_df
        inv_data = sequence_df[(sequence_df['frame'] >= i_start) & (sequence_df['frame'] <= i_end)]
        
        #calculate the center-of-mass
        # Select all resident keypoints
        x_cols = [col for col in inv_data.columns if col.startswith('resident_x_')]
        y_cols = [col for col in inv_data.columns if col.startswith('resident_y_')]

        # Stack and compute the CoM
        com_x = inv_data[x_cols].mean(axis=1).values
        com_y = inv_data[y_cols].mean(axis=1).values

        # Motion features from CoM
        vx = np.insert(np.diff(com_x), 0, 0) / dt
        vy = np.insert(np.diff(com_y), 0, 0) / dt
        speed = np.sqrt(vx**2 + vy**2)
        acc = np.sqrt(np.insert(np.diff(vx), 0, 0)**2 + np.insert(np.diff(vy), 0, 0)**2) / dt

        # Vectors from COM to nose and COM to tail base
        vec_nose_x = inv_data['resident_x_nose'].values - com_x
        vec_nose_y = inv_data['resident_y_nose'].values - com_y
        vec_tail_x = inv_data['resident_x_tail_base'].values - com_x
        vec_tail_y = inv_data['resident_y_tail_base'].values - com_y

        # Angle between vectors (COM to nose) and (COM to tail base)
        dot = vec_nose_x * vec_tail_x + vec_nose_y * vec_tail_y
        norm_nose = np.sqrt(vec_nose_x**2 + vec_nose_y**2)
        norm_tail = np.sqrt(vec_tail_x**2 + vec_tail_y**2)

        # Clip to prevent floating point issues in arccos
        cos_theta = np.clip(dot / (norm_nose * norm_tail), -1.0, 1.0)
        rotation_angle = np.arccos(cos_theta)  # in radians

        # Rotation speed = frame-to-frame angular change
        rotation_speed = np.abs(np.insert(np.diff(rotation_angle), 0, 0)) / dt

        acc = np.sqrt(np.insert(np.diff(vx), 0, 0)**2 + np.insert(np.diff(vy), 0, 0)**2) / dt

        result.append({
            'sequence': inv['sequence'],
            'investigation_ID': int(inv_id),
            'pair_label': pair_label,
            'paired': paired,
            'start_frame': i_start,
            'end_frame': i_end,
            'duration': inv['duration'],
            'mean_speed': np.mean(speed),
            'mean_acc': np.mean(acc),
            'mean_rotation': np.mean(rotation_angle),
            'mean_rotation_speed': np.mean(rotation_speed)
        })

    return pd.DataFrame(result)


In [38]:
#example usage
all_investigations = []

for seq_name in df['sequence'].unique():
    seq_df = df[df['sequence'] == seq_name].sort_values('frame')
    bout_df = find_I_A_all(seq_df)
    inv_bouts_with_features = find_I_A_pair_and_features(seq_df, bout_df)
    all_investigations.append(inv_bouts_with_features)

final_df = pd.concat(all_investigations, ignore_index=True)


In [39]:
final_df

Unnamed: 0,sequence,investigation_ID,pair_label,paired,start_frame,end_frame,duration,mean_speed,mean_acc,mean_rotation,mean_rotation_speed
0,task1/train/mouse001_task1_annotator1,0,0_0,0,1,126,126,198.598475,3312.921155,2.861946,1.435074
1,task1/train/mouse001_task1_annotator1,1,1_0,0,136,633,498,190.933311,3225.591195,2.924218,1.239495
2,task1/train/mouse001_task1_annotator1,2,2_0,0,659,714,56,282.668019,4636.255526,2.938320,1.358593
3,task1/train/mouse001_task1_annotator1,3,3_0,0,736,898,163,144.627995,3110.452149,3.002670,1.198722
4,task1/train/mouse001_task1_annotator1,4,4_0,0,944,1045,102,85.162713,2527.840394,2.714141,1.095686
...,...,...,...,...,...,...,...,...,...,...,...
2874,task1/test/mouse089_task1_annotator1,42,42_0,0,10231,10236,6,78.476781,3517.480930,3.126484,0.283671
2875,task1/test/mouse089_task1_annotator1,43,43_0,0,10336,10343,8,117.942193,2651.071566,2.808574,1.378124
2876,task1/test/mouse089_task1_annotator1,44,44_0,0,10513,10525,13,43.492372,1546.329159,2.936758,0.933984
2877,task1/test/mouse089_task1_annotator1,45,45_0,0,10843,10986,144,63.153081,1917.414403,3.040754,0.547662


In [40]:
final_df.to_csv('final_df.csv', index=False, header=True)


In [41]:

np.save('final_df.npy', final_df.to_dict('records'))  # saves as list of dicts


In [42]:
final_df.to_pickle('final_df.pkl')
