One-off cleanup: remove unused keypoint indices from an already-exported dataset CSV

In [2]:
import pandas as pd

# Define the function based on the provided keypoint indices
def keypoint_to_face_part(index):
    """Map a 1-based keypoint index to a face-part label.

    Returns the label string for indices that belong to a tracked face part,
    ``0`` for indices that should not be loaded into the DataFrame, and
    ``None`` when the index matches none of the ranges below.
    """
    # (low, high, result) triples, tested top to bottom; the first hit wins.
    # The 56-66 exclusion is listed before the wider 52-66 "Nose" span so it
    # takes precedence, exactly as a nested check inside the nose range would.
    ranges = (
        (1, 33, 0),
        (34, 42, "Right_brow"),
        (43, 51, "Left_brow"),
        (56, 66, 0),
        (52, 66, "Nose"),
        (67, 75, "Right_Eye"),
        (76, 84, "Left_Eye"),
        (85, 104, "Mouth"),
        (105, 105, 0),
        (106, 106, 0),
    )
    for low, high, result in ranges:
        if low <= index <= high:
            return result
    return None

# Where the existing dataset lives and where the filtered copy is written
csv_file_path = "/workspaces/wiggle-face/data/panda_data/keypoints_clean_PANDA2_dataset.csv"
filtered_csv_file_path = "/workspaces/wiggle-face/data/panda_data/panda_new/keypoints_clean_PANDA2_2_dataset.csv"

# Read the whole table into memory
df = pd.read_csv(csv_file_path)

# Label every row via its keypoint index; a label of 0 marks a row to discard.
# NOTE(review): this assignment replaces any 'face_part' column already present
# in the CSV, and the column is dropped again before saving — confirm intended.
df['face_part'] = df['keypoint_index'].apply(keypoint_to_face_part)
keep_rows = df['face_part'] != 0

# Keep the surviving rows and discard the helper column in one step
filtered_df = df[keep_rows].drop(columns=['face_part'])

# Write the result without the pandas index column
filtered_df.to_csv(filtered_csv_file_path, index=False)

print("Filtering complete. The filtered CSV file has been saved.")


Filtering complete. The filtered CSV file has been saved.


Actual Loading and Filtering

In [2]:
# imports
import os
import json
import pandas as pd
import pandas as pd
import numpy as np
import gc
from scipy.ndimage import median_filter
from concurrent.futures import ProcessPoolExecutor, as_completed

In [3]:
class KeypointsDataset:
    """Stream smoothed face keypoints from a directory of JSON annotation files.

    Every ``*.json`` file in ``json_dir`` is read as a list of frames. Each frame
    provides a ``frame_id`` and a list of ``instances``; each instance provides
    parallel ``keypoints`` and ``keypoint_scores`` lists. Keypoints are labelled
    with a face part, smoothed across rows per keypoint index, filtered by
    confidence per frame, and yielded in batches of plain dict records.

    Parameters
    ----------
    json_dir : str
        Directory containing the annotation files.
    batch_size : int, default 200000
        ``load_data`` yields as soon as this many records have accumulated.
    min_score : float, default 0.75
        A frame is kept only when every retained keypoint in it has a
        ``keypoint_score`` of at least this value.
    """

    # (low, high, result) triples, tested in order; first hit wins. A result of 0
    # means "do not load". The 56-66 exclusion precedes the wider 52-66 nose span
    # so that it takes priority.
    _FACE_PART_RANGES = (
        (1, 33, 0),
        (34, 42, "Right_brow"),
        (43, 51, "Left_brow"),
        (56, 66, 0),
        (52, 66, "Nose"),
        (67, 75, "Right_Eye"),
        (76, 84, "Left_Eye"),
        (85, 104, "Mouth"),
        (105, 105, 0),
        (106, 106, 0),
    )

    def __init__(self, json_dir, batch_size=200000, min_score=0.75):
        self.json_dir = json_dir
        # Sorted so that the processing order does not depend on the filesystem.
        self.json_files = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))
        self.batch_size = batch_size
        self.min_score = min_score

    def keypoint_to_face_part(self, index):
        """Return the face-part label for a 1-based keypoint index.

        Returns ``0`` for indices that must not be loaded and ``None`` when the
        index matches none of the known ranges.
        """
        for low, high, result in self._FACE_PART_RANGES:
            if low <= index <= high:
                return result
        return None

    def apply_filters(self, df):
        """Add smoothed coordinate columns to ``df`` (in place) and return it.

        For each ``keypoint_index`` group, in row order:
        * ``mean_keypoint_x/y``: rolling mean over a window of 3 rows
          (``min_periods=1``, so the first rows average over fewer values).
        * ``median_keypoint_x/y``: size-3 median filter applied to the
          mean-filtered values.

        NOTE(review): rows are grouped by keypoint index only; if a frame holds
        more than one instance, neighbouring rows in a group may belong to
        different instances — confirm this is acceptable for the data.
        """
        # Apply mean filter across frames for each keypoint index
        df['mean_keypoint_x'] = df.groupby('keypoint_index')['keypoint_x'].rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)
        df['mean_keypoint_y'] = df.groupby('keypoint_index')['keypoint_y'].rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)

        # Apply median filter on the mean-filtered data across frames for each keypoint index
        df['median_keypoint_x'] = df.groupby('keypoint_index')['mean_keypoint_x'].transform(lambda x: median_filter(x, size=3))
        df['median_keypoint_y'] = df.groupby('keypoint_index')['mean_keypoint_y'].transform(lambda x: median_filter(x, size=3))

        return df

    def _collect_records(self, frames, infant_id, age):
        """Flatten one file's frames into a list of per-keypoint dicts, skipping unlabelled indices."""
        records = []
        for frame_data in frames:
            frame_id = frame_data["frame_id"]
            for instance in frame_data["instances"]:
                pairs = zip(instance["keypoints"], instance["keypoint_scores"])
                # Keypoint indices are 1-based, hence start=1.
                for keypoint_index, (kp, score) in enumerate(pairs, start=1):
                    face_part = self.keypoint_to_face_part(keypoint_index)
                    if face_part == 0:
                        continue
                    records.append({
                        "infant_id": infant_id,
                        "age_weeks": age,
                        "frame_id": frame_id,
                        "keypoint_score": score,
                        "face_part": face_part,
                        "keypoint_index": keypoint_index,
                        "keypoint_x": kp[0],
                        "keypoint_y": kp[1]
                    })
        return records

    def load_data(self):
        """Yield lists of keypoint records, at most roughly ``batch_size`` long.

        Each record is a dict with keys ``infant_id``, ``age_weeks``,
        ``frame_id``, ``keypoint`` (the smoothed ``(x, y)`` pair),
        ``keypoint_score``, ``face_part`` and ``keypoint_index``. Records
        accumulate across files; a batch is yielded whenever ``batch_size`` is
        reached and once more at the end for any remainder.

        The file name (without extension) is split on ``_``: field 0 is the
        infant id, field 4 the age, field 6 the age unit.
        """
        data = []
        for json_file in self.json_files:
            video_id = os.path.splitext(json_file)[0]
            parts = video_id.split('_')

            infant_id = parts[0]
            # Ages given in weeks are used as-is; any other unit is multiplied by 4
            # (presumably months -> weeks; confirm against the file naming scheme).
            if parts[6] == 'week':
                age = int(parts[4])
            else:
                age = int(parts[4]) * 4

            print(f"Processing infant ID: {infant_id}, age in weeks: {age}")

            # Read everything up front so the file handle is closed before this
            # generator suspends at a yield.
            with open(os.path.join(self.json_dir, json_file), 'r') as f:
                frames = json.load(f)

            records = self._collect_records(frames, int(infant_id), age)
            if not records:
                # Nothing to smooth; an empty DataFrame has no columns to group by.
                continue

            # Smooth the keypoints before any frame is discarded.
            df = self.apply_filters(pd.DataFrame(records))

            # Keep a frame only if every keypoint in it reaches the confidence
            # threshold. (Comparing the scores first and reducing with .all()
            # afterwards is what makes the threshold effective.)
            for _, frame_group in df.groupby('frame_id'):
                if not (frame_group['keypoint_score'] >= self.min_score).all():
                    continue

                for row in frame_group.itertuples(index=False):
                    data.append({
                        "infant_id": row.infant_id,
                        "age_weeks": row.age_weeks,
                        "frame_id": row.frame_id,
                        "keypoint": (row.median_keypoint_x, row.median_keypoint_y),
                        "keypoint_score": row.keypoint_score,
                        "face_part": row.face_part,
                        "keypoint_index": row.keypoint_index
                    })

                    if len(data) >= self.batch_size:
                        yield data
                        data = []
                        gc.collect()

        if data:
            yield data

In [None]:
# Build the full keypoint table from the annotation directory, one batch at a time.
json_dir = r'/workspaces/wiggle-face/data-ioana/PANDA2/face/annotations'
dataset = KeypointsDataset(json_dir)
df_list = []

try:
    for batch_data in dataset.load_data():
        batch_df = pd.DataFrame(batch_data)
        df_list.append(batch_df)
        gc.collect()  # Collect garbage to free memory

    # Concatenate all collected DataFrames at once
    if df_list:
        df = pd.concat(df_list, ignore_index=True)
        print(f"Final DataFrame created with {len(df)} records")
    else:
        print("No data processed. DataFrame is empty.")
        df = pd.DataFrame()  # Return an empty DataFrame if no data was processed

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise: swallowing the error would leave `df` undefined, or still bound
    # to whatever an earlier cell assigned, and later cells would silently run
    # on the wrong data.
    raise

In [None]:
# Main script
# Same load as the guarded cell, without the try/except wrapper.
json_dir = r'/workspaces/wiggle-face/data-ioana/PANDA2/face/annotations'
dataset = KeypointsDataset(json_dir)
df_list = []

for batch_data in dataset.load_data():
    batch_df = pd.DataFrame(batch_data)
    df_list.append(batch_df)

# Concatenate all collected DataFrames at once.
# pd.concat raises ValueError on an empty list, so the "nothing loaded" case is
# handled explicitly, matching the guarded variant of this cell.
if df_list:
    df = pd.concat(df_list, ignore_index=True)
else:
    print("No data processed. DataFrame is empty.")
    df = pd.DataFrame()

In [None]:
# Order rows by infant, then age, then frame, and renumber the index from 0
row_order = ['infant_id', 'age_weeks', 'frame_id']
df = df.sort_values(by=row_order)
df = df.reset_index(drop=True)
print(df)

In [None]:
## Identify continuous segments that are at least 10 frames long (roughly 1/3 second)
track_keys = ['infant_id', 'age_weeks', 'keypoint_index']
min_block_len = 10

# Sort so that each track's frames are adjacent and ascending
df = df.sort_values(by=track_keys + ['frame_id'])

# Gap to the previous frame inside each track; a track's first row has no
# predecessor and is given a gap of 1 so it does not open a new block by itself
df['frame_diff'] = df.groupby(track_keys)['frame_id'].diff().fillna(1)

# Running counter that increases each time the gap is anything other than 1
df['block'] = (df['frame_diff'] != 1).cumsum()

# Keep only the (track, block) groups with enough rows
segments = df.groupby(track_keys + ['block']).filter(lambda seg: len(seg) >= min_block_len)

# Drop the helper columns, normalise the column name and order, then re-sort.
# The rename only has an effect if a "processed_keypoint" column is present.
output_columns = ['infant_id', 'age_weeks', 'frame_id', 'keypoint_index', 'keypoint', 'keypoint_score', 'face_part']
blocks = segments.drop(columns=['block', 'frame_diff'])
blocks = blocks.rename(columns={"processed_keypoint": "keypoint"})[output_columns]
blocks = blocks.sort_values(by=['infant_id', 'age_weeks', 'frame_id'])
blocks = blocks.reset_index(drop=True)
blocks.head()

In [None]:
# Plain-text dump of the `blocks` table
print(blocks)

In [None]:
# save as csv files to be processed by "Data_Inspection_Emotion_Analysis"
# (written relative to the current working directory, without the index column)
output_csv = 'keypoints_clean_PANDA2_2_dataset.csv'
blocks.to_csv(output_csv, index=False)
print("Dataset saved successfully as keypoints_clean_PANDA2_2_dataset.csv")