In [1]:
import os
import glob
import pandas as pd
import cv2
import re

#Function for counting and displaying the amount of frames per behavior
def count_frames_by_behavior(directory, frame_rate=25):
    #Gather all CSV files in the directory
    csv_files = glob.glob(os.path.join(directory, "*.csv"))
    
    #Concatenate all CSV files
    df_list = []
    for file in csv_files:
        try:
            df = pd.read_csv(file, encoding='utf-8')  # Try UTF-8 first
        except UnicodeDecodeError:
            df = pd.read_csv(file, encoding='ISO-8859-1')  # Fallback to ISO-8859-1 if needed
        df_list.append(df)
    full_df = pd.concat(df_list, ignore_index=True)
    
    #Create a new column for the merged behavior categories
    full_df['Merged Behavior'] = full_df['Behavior'].apply(lambda x: 'INSC_merged' if x != 'NOBR' else x)
    
    #Filter rows for 'START' and 'STOP' behaviors
    start_behaviors = full_df[full_df['Behavior type'] == 'START']
    stop_behaviors = full_df[full_df['Behavior type'] == 'STOP']
    
    #Merge start and stop times by 'Observation id' and 'Behavior'
    merged_behaviors = pd.merge(
        start_behaviors[['Observation id', 'Behavior', 'Merged Behavior', 'Time']],
        stop_behaviors[['Observation id', 'Behavior', 'Merged Behavior', 'Time']],
        on=['Observation id', 'Behavior', 'Merged Behavior'],
        suffixes=('_start', '_stop')
    )
    
    #Calculate duration in seconds and convert to frames
    merged_behaviors['Duration_sec'] = merged_behaviors['Time_stop'] - merged_behaviors['Time_start']
    merged_behaviors['Frame_count'] = (merged_behaviors['Duration_sec'] * frame_rate).astype(int)
    
    #Sum frame counts for each original behavior and for merged behaviors
    frame_counts_per_behavior = merged_behaviors.groupby('Behavior')['Frame_count'].sum()
    frame_counts_per_merged_behavior = merged_behaviors.groupby('Merged Behavior')['Frame_count'].sum()
    
    #Output results for both original and merged behaviors
    print("Frame counts for each original behavior:")
    for behavior, count in frame_counts_per_behavior.items():
        print(f"Behavior '{behavior}' has {count} frames.")
    
    print("\nFrame counts for merged behavior categories:")
    for merged_behavior, count in frame_counts_per_merged_behavior.items():
        print(f"Merged Behavior '{merged_behavior}' has {count} frames.")
    
    return frame_counts_per_behavior, frame_counts_per_merged_behavior

#Call the function
count_frames_by_behavior(r'C:\Users\Madison\Documents\Manual Scoring Results')

#Function for extracting frames based on the manually scored behavior timestamps
def extract_frames_by_behavior(video_dir, csv_dir, output_dir, frame_rate=25):
    #Gather CSV files and match them to video files based on numbering
    csv_files = glob.glob(os.path.join(csv_dir, "*.csv"))
    video_files = glob.glob(os.path.join(video_dir, "*.mp4"))  # Adjust if your video files have different extension
    
    # Sort files to match numbers easily
    csv_files.sort()
    video_files.sort()

    #Create output directories if they don't exist
    insc_dir = os.path.join(output_dir, "INSC_merged")
    nobr_dir = os.path.join(output_dir, "NOBR")
    os.makedirs(insc_dir, exist_ok=True)
    os.makedirs(nobr_dir, exist_ok=True)

    #Process each CSV and corresponding video file
    for csv_path, video_path in zip(csv_files, video_files):
        video_name = os.path.basename(video_path)
        csv_name = os.path.basename(csv_path)
        
        # Load CSV data and check for the necessary columns
        try:
            df = pd.read_csv(csv_path, encoding='utf-8')
        except UnicodeDecodeError:
            df = pd.read_csv(csv_path, encoding='ISO-8859-1')

        # Verify 'Time' and 'Behavior type' columns are in the CSV
        if 'Time' not in df.columns or 'Behavior type' not in df.columns:
            print(f"Error: Required columns 'Time' or 'Behavior type' not found in {csv_name}. Skipping this file.")
            continue

        # Create merged behavior category
        df['Merged Behavior'] = df['Behavior'].apply(lambda x: 'INSC_merged' if x != 'NOBR' else x)

        # Filter for start and stop times
        start_behaviors = df[df['Behavior type'] == 'START']
        stop_behaviors = df[df['Behavior type'] == 'STOP']

        # Merge start and stop times
        merged_behaviors = pd.merge(
            start_behaviors[['Behavior', 'Merged Behavior', 'Time']],
            stop_behaviors[['Behavior', 'Merged Behavior', 'Time']],
            on=['Behavior', 'Merged Behavior'],
            suffixes=('_start', '_stop')
        )

        # Ensure merged DataFrame has start and stop times
        if 'Time_start' not in merged_behaviors.columns or 'Time_stop' not in merged_behaviors.columns:
            print(f"Error: Merging failed for {csv_name}. Check if 'START' and 'STOP' behaviors are paired correctly.")
            continue

        # Open the video file
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Get the current frame time in seconds
            frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            current_time = frame_number / fps

            # Iterate over each behavior's start and stop times to extract frames
            for _, row in merged_behaviors.iterrows():
                start_time = row['Time_start']
                stop_time = row['Time_stop']
                behavior = row['Merged Behavior']
                
                # Check if current frame is within the behavior's time range
                if start_time <= current_time <= stop_time:
                    folder = insc_dir if behavior == 'INSC_merged' else nobr_dir
                    frame_path = os.path.join(folder, f"{video_name}_frame_{frame_number}.jpg")
                    cv2.imwrite(frame_path, frame)
                    break  # Break to avoid saving the same frame in multiple behaviors

        cap.release()

    print("Frames extracted and saved based on behavior timestamps and corresponding CSVs.")


#commented out to avoid running and uncomment when needed.
#extract_frames_by_behavior(r'E:\masked_videos', r'C:\Users\Madison\Documents\Manual Scoring Results', r'E:\Masked_vid_frames')

#Function to determine the number of unique recording events per category (NOBR and INSC_merged)
def count_unique_recordings(folder_path):
        # Extract unique recording numbers from frame filenames
        recording_ids = set()
        frame_files = glob.glob(os.path.join(folder_path, "*.jpg"))
        print(f"Found {len(frame_files)} frames in {folder_path}")  # Debug: Show number of frames found in folder
        for frame_file in frame_files:
            base_name = os.path.basename(frame_file)
            recording_id = "_".join(base_name.split("_")[:2])  # Should capture only 'video_#'
            recording_ids.add(recording_id)
            #print(f"Adding recording ID: {recording_id}")  # Debug: Show each unique ID added

        return recording_ids

    # Count and display unique recordings for each folder
insc_recordings = count_unique_recordings(r'E:\Masked_vid_frames\INSC_merged')
nobr_recordings = count_unique_recordings(r'E:\Masked_vid_frames\NOBR')
    
print(f"INSC_merged contains {len(insc_recordings)} unique recordings: {sorted(insc_recordings)}")
print(f"NOBR contains {len(nobr_recordings)} unique recordings: {sorted(nobr_recordings)}")


#next steps: splitting the dataset into testing, training, and validation datasets 

Frame counts for each original behavior:
Behavior 'ENTR' has 590 frames.
Behavior 'EXIT' has 2742 frames.
Behavior 'INSC' has 3340599 frames.
Behavior 'NOBR' has 14660 frames.
Behavior 'PERC' has 161926 frames.

Frame counts for merged behavior categories:
Merged Behavior 'INSC_merged' has 3505857 frames.
Merged Behavior 'NOBR' has 14660 frames.
Found 82511 frames in E:\Masked_vid_frames\INSC_merged
Found 1294 frames in E:\Masked_vid_frames\NOBR
INSC_merged contains 205 unique recordings: ['video_10', 'video_100', 'video_1000', 'video_1001', 'video_1002', 'video_1003', 'video_1004', 'video_1005', 'video_1006', 'video_1007', 'video_1008', 'video_1009', 'video_101', 'video_1010', 'video_1011', 'video_1012', 'video_1013', 'video_1014', 'video_1015', 'video_1016', 'video_1017', 'video_1018', 'video_1019', 'video_102', 'video_1020', 'video_1021', 'video_1022', 'video_1023', 'video_1024', 'video_1025', 'video_1026', 'video_1027', 'video_1028', 'video_1029', 'video_103', 'video_1030', 'video_