In [26]:
# imports
import os
import json
import pandas as pd
import pandas as pd
import numpy as np
import gc
from scipy.ndimage import median_filter

In [4]:
# class KeypointsDataset:
#     def __init__(self, json_dir, batch_size=200000):
#         self.json_dir = json_dir
#         self.json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
#         self.batch_size = batch_size

#     def keypoint_to_face_part(self, index):
#         if 1 <= index <= 33:
#             return 0  # Do not load in df
#         elif 34 <= index <= 42:
#             return "Right_brow"
#         elif 43 <= index <= 51:
#             return "Left_brow"
#         elif 52 <= index <= 66:
#             if 56 <= index <= 66:
#                 return 0  # Do not load in df
#             return "Nose"
#         elif 67 <= index <= 75:
#             return "Right_Eye"
#         elif 76 <= index <= 84:
#             return "Left_Eye"
#         elif 85 <= index <= 104:
#             return "Mouth"
#         elif index == 105 or index == 106:
#             return 0  # Do not load in df

#     def load_data(self):
#         data = []
#         for json_file in self.json_files:
#             video_id = os.path.splitext(json_file)[0]
#             parts = video_id.split('_')
#             date = '_'.join(parts[1:4])
#             infant_id = parts[5]  
#             camera = parts[6]

#             cam = ord(camera[3:4]) - ord('0')
#             print(f"Processing infant ID: {infant_id} on date: {date}, camera: {cam}")
#             with open(os.path.join(self.json_dir, json_file), 'r') as f:
#                 frames = json.load(f)
#                 for frame_index, frame_data in enumerate(frames):
#                     frame_id = frame_data["frame_id"]
#                     for instance_index, instance in enumerate(frame_data["instances"]):
#                         keypoints = instance["keypoints"]
#                         keypoint_scores = instance["keypoint_scores"]

#                         for idx, (kp, score) in enumerate(zip(keypoints, keypoint_scores)):
#                             face_part = self.keypoint_to_face_part(idx + 1)
#                             if face_part != 0:
#                                 data.append({
#                                     "infant_id": int(infant_id),
#                                     "date": date,
#                                     "cam": cam,
#                                     "frame_id": frame_id,
#                                     "keypoint": tuple(kp),
#                                     "keypoint_score": score,
#                                     "face_part": face_part,
#                                     "keypoint_index": idx + 1    
#                                 })

#                                 if len(data) >= self.batch_size:
#                                     yield data
#                                     data = []
#                                     gc.collect()

#         if data:
#             yield data

In [27]:
class KeypointsDataset:
    """Stream smoothed facial keypoints from a directory of per-video JSON files.

    Each ``*.json`` file in ``json_dir`` is read as a list of frames. Every frame
    is a dict with a ``"frame_id"`` and a list of ``"instances"``; every instance
    carries parallel ``"keypoints"`` (x, y pairs) and ``"keypoint_scores"`` lists.

    File names (without extension) are split on ``'_'``: fields 1-3 form the
    date, field 5 is the integer infant id, and the fourth character of field 6
    is the camera digit.

    Args:
        json_dir: Directory containing the annotation files.
        batch_size: Number of keypoint records per yielded batch.
        min_frame_confidence: A frame is kept only when the mean score of its
            retained keypoints is at least this value.
    """

    # Inclusive 1-based keypoint index ranges that are kept, with their label.
    # Every other index is discarded (keypoint_to_face_part returns 0).
    _FACE_PART_RANGES = (
        (34, 42, "Right_brow"),
        (43, 51, "Left_brow"),
        (52, 55, "Nose"),
        (67, 75, "Right_Eye"),
        (76, 84, "Left_Eye"),
        (85, 104, "Mouth"),
    )

    def __init__(self, json_dir, batch_size=200000, min_frame_confidence=0.8):
        self.json_dir = json_dir
        self.json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
        self.batch_size = batch_size
        self.min_frame_confidence = min_frame_confidence

    def keypoint_to_face_part(self, index):
        """Return the face-part label for a 1-based keypoint index.

        Returns ``0`` for every index that must not be loaded, including indices
        outside the known 1..106 range (previously those returned ``None`` and
        slipped past the ``!= 0`` check in ``load_data``).
        """
        for first, last, label in self._FACE_PART_RANGES:
            if first <= index <= last:
                return label
        return 0  # Do not load in df

    def apply_filters(self, df):
        """Smooth ``keypoint_x`` / ``keypoint_y`` along the row order of ``df``.

        For each ``keypoint_index`` a 3-sample rolling mean is computed, followed
        by a 3-sample median filter on the mean-filtered values. The columns
        ``mean_keypoint_*`` and ``median_keypoint_*`` are added to ``df`` in place
        and the same frame is returned.

        NOTE(review): rows are grouped by ``keypoint_index`` only and filtered in
        their existing order, so this assumes rows are already in frame order and
        that a frame holds a single instance -- confirm against the data.
        """
        for axis_col, mean_col, median_col in (
            ('keypoint_x', 'mean_keypoint_x', 'median_keypoint_x'),
            ('keypoint_y', 'mean_keypoint_y', 'median_keypoint_y'),
        ):
            # Mean filter across frames for each keypoint index
            df[mean_col] = (
                df.groupby('keypoint_index')[axis_col]
                .rolling(window=3, min_periods=1)
                .mean()
                .reset_index(level=0, drop=True)
            )
            # Median filter on the mean-filtered data for each keypoint index
            df[median_col] = df.groupby('keypoint_index')[mean_col].transform(
                lambda x: median_filter(x, size=3)
            )

        return df

    def _frames_to_dataframe(self, frames, infant_id, date, cam):
        """Flatten the retained keypoints of one video into a DataFrame (one row per keypoint)."""
        rows = []
        for frame_data in frames:
            frame_id = frame_data["frame_id"]
            for instance in frame_data["instances"]:
                pairs = zip(instance["keypoints"], instance["keypoint_scores"])
                for idx, (kp, score) in enumerate(pairs):
                    face_part = self.keypoint_to_face_part(idx + 1)
                    if face_part != 0:
                        rows.append({
                            "infant_id": infant_id,
                            "date": date,
                            "cam": cam,
                            "frame_id": frame_id,
                            "keypoint_score": score,
                            "face_part": face_part,
                            "keypoint_index": idx + 1,
                            "keypoint_x": kp[0],
                            "keypoint_y": kp[1],
                        })
        return pd.DataFrame(rows)

    @staticmethod
    def _frame_records(frame_group):
        """Turn the rows of one frame into output dicts without per-row ``iterrows`` overhead."""
        return [
            {
                "infant_id": infant_id,
                "date": date,
                "cam": cam,
                "frame_id": frame_id,
                "keypoint": (x, y),
                "keypoint_score": score,
                "face_part": face_part,
                "keypoint_index": keypoint_index,
            }
            for infant_id, date, cam, frame_id, x, y, score, face_part, keypoint_index in zip(
                frame_group["infant_id"].tolist(),
                frame_group["date"].tolist(),
                frame_group["cam"].tolist(),
                frame_group["frame_id"].tolist(),
                frame_group["median_keypoint_x"].tolist(),
                frame_group["median_keypoint_y"].tolist(),
                frame_group["keypoint_score"].tolist(),
                frame_group["face_part"].tolist(),
                frame_group["keypoint_index"].tolist(),
            )
        ]

    def load_data(self):
        """Yield lists of keypoint records, ``batch_size`` records at a time.

        For every JSON file the retained keypoints are smoothed with
        ``apply_filters``; frames whose mean keypoint score is below
        ``min_frame_confidence`` are dropped. Each record is a dict with the keys
        ``infant_id``, ``date``, ``cam``, ``frame_id``, ``keypoint`` (the smoothed
        ``(x, y)`` tuple), ``keypoint_score``, ``face_part`` and ``keypoint_index``.
        Batches may span several files; a final partial batch is yielded at the end.

        Raises:
            IndexError / ValueError: if a file name does not follow the expected
                underscore-separated layout.
        """
        data = []
        for json_file in self.json_files:
            video_id = os.path.splitext(json_file)[0]
            parts = video_id.split('_')
            date = '_'.join(parts[1:4])
            infant_id = parts[5]
            camera = parts[6]

            # The camera number is the single digit at position 3 of the camera field.
            cam = ord(camera[3:4]) - ord('0')
            print(f"Processing infant ID: {infant_id} on date: {date}, camera: {cam}")

            # Read the whole file, then release the handle before the generator
            # suspends at a yield.
            with open(os.path.join(self.json_dir, json_file), 'r') as f:
                frames = json.load(f)

            # Convert to DataFrame for processing
            df = self._frames_to_dataframe(frames, int(infant_id), date, cam)
            if df.empty:
                # No retained keypoints in this file: the column-less frame would
                # make the groupby in apply_filters raise KeyError.
                continue

            # Apply filters to smooth the keypoints
            df = self.apply_filters(df)

            # Remove frames with average keypoint confidence under the threshold
            for _, frame_group in df.groupby('frame_id'):
                if frame_group['keypoint_score'].mean() >= self.min_frame_confidence:
                    for record in self._frame_records(frame_group):
                        data.append(record)

                        if len(data) >= self.batch_size:
                            yield data
                            data = []
                            gc.collect()

        if data:
            yield data

In [28]:
# working with PANDA 3/Yt data
json_dir = r'/workspaces/wiggle-face/data-ioana/PANDA3/face/annotations' 
dataset = KeypointsDataset(json_dir)

# Materialise every batch first and concatenate once: calling pd.concat inside
# the loop re-copies all rows gathered so far on each iteration (quadratic cost).
batch_frames = [pd.DataFrame(batch_data) for batch_data in dataset.load_data()]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory produced no records.
df = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

Processing infant ID: 218 on date: 2022_11_30, camera: 3
Processing infant ID: 203 on date: 2022_02_04, camera: 3
Processing infant ID: 211 on date: 2022_09_08, camera: 3
Processing infant ID: 228 on date: 2023_07_07, camera: 3
Processing infant ID: 225 on date: 2022_12_06, camera: 3
Processing infant ID: 215 on date: 2022_11_04, camera: 3
Processing infant ID: 201 on date: 2021_10_07, camera: 3
Processing infant ID: 201 on date: 2021_07_26, camera: 3
Processing infant ID: 221 on date: 2022_12_19, camera: 3
Processing infant ID: 207 on date: 2022_02_14, camera: 3
Processing infant ID: 214 on date: 2022_09_12, camera: 3
Processing infant ID: 208 on date: 2022_07_11, camera: 3
Processing infant ID: 215 on date: 2023_03_03, camera: 3
Processing infant ID: 214 on date: 2022_11_07, camera: 3
Processing infant ID: 203 on date: 2021_11_05, camera: 3
Processing infant ID: 202 on date: 2021_08_17, camera: 3
Processing infant ID: 212 on date: 2022_09_29, camera: 3
Processing infant ID: 207 on da

In [29]:
# Write the loaded keypoint records to disk (without the index column) before further processing.
df.to_csv('preliminary_2.csv', index=False)

In [30]:
# Display the first rows of the loaded frame.
df.head()

Unnamed: 0,infant_id,date,cam,frame_id,keypoint,keypoint_score,face_part,keypoint_index
0,218,2022_11_30,3,367,"(979.2205200195312, 329.4839375813802)",0.726164,Right_brow,34
1,218,2022_11_30,3,367,"(982.0897827148438, 325.85010782877606)",0.693672,Right_brow,35
2,218,2022_11_30,3,367,"(985.1497802734375, 324.98931884765625)",0.684873,Right_brow,36
3,218,2022_11_30,3,367,"(988.4008585611979, 324.8057352701823)",0.692774,Right_brow,37
4,218,2022_11_30,3,367,"(991.2701416015625, 325.1875)",0.695921,Right_brow,38


In [31]:
# Order rows by infant, date, camera and frame number, and renumber the index from 0.
df = df.sort_values(by=['infant_id','date','cam','frame_id']).reset_index(drop=True)
print(df)

          infant_id        date  cam  frame_id  \
0               201  2021_07_26    3         1   
1               201  2021_07_26    3         1   
2               201  2021_07_26    3         1   
3               201  2021_07_26    3         1   
4               201  2021_07_26    3         1   
...             ...         ...  ...       ...   
29678455        247  2024_06_06    3      7745   
29678456        247  2024_06_06    3      7745   
29678457        247  2024_06_06    3      7745   
29678458        247  2024_06_06    3      7745   
29678459        247  2024_06_06    3      7745   

                                         keypoint  keypoint_score   face_part  \
0          (661.7792358398438, 850.8797200520834)        0.815701  Right_brow   
1          (665.3018493652344, 845.0055338541666)        0.805013  Right_brow   
2          (670.5080973307291, 840.4264831542969)        0.834134  Right_brow   
3          (676.5781046549479, 837.4498291015625)        0.844086  Right_br

In [None]:
## Identify continuous segments that are at least 10 frames long (roughly 1/3 second) 
MIN_SEGMENT_FRAMES = 10

df = df.sort_values(by=['infant_id', 'date', 'cam', 'keypoint_index', 'frame_id'])
# identify continuous segments for each 'infant_id' and 'keypoint_index'

# Gap to the previous frame of the same recording/keypoint; the first row of
# each group has no predecessor and is treated as contiguous (diff = 1).
df['frame_diff'] = df.groupby(['infant_id','date', 'cam', 'keypoint_index'])['frame_id'].diff().fillna(1)
# A new block id starts whenever the gap is not exactly one frame. The id is a
# running counter over the whole frame, so it only identifies a segment together
# with the recording/keypoint keys used in the groupby below.
df['block'] = (df['frame_diff'] != 1).cumsum()

# Size of the segment each row belongs to, computed vectorised; this selects the
# same rows as groupby(...).filter(lambda x: len(x) >= 10) without calling a
# Python function once per segment.
segment_length = df.groupby(['infant_id','date','cam','keypoint_index', 'block'])['frame_id'].transform('size')
blocks = df[segment_length >= MIN_SEGMENT_FRAMES]


In [33]:
# Remove the helper columns used for segment detection. errors='ignore' keeps
# this cell re-runnable: after the first run the column selection below has
# already removed them, and a plain drop would raise KeyError.
blocks = blocks.drop(columns=['block', 'frame_diff'], errors='ignore')
# Only has an effect when the frame still carries a 'processed_keypoint' column;
# rename silently skips missing labels.
blocks = blocks.rename(columns={"processed_keypoint":"keypoint"})
# Fix the output column order, then order rows by recording and frame number.
blocks = blocks[['infant_id', 'date', 'cam', 'frame_id', 'keypoint_index', 'keypoint', 'keypoint_score', 'face_part']]
blocks = blocks.sort_values(by=['infant_id','date','cam','frame_id']).reset_index(drop = True)
blocks.head()

Unnamed: 0,infant_id,date,cam,frame_id,keypoint_index,keypoint,keypoint_score,face_part
0,201,2021_07_26,3,1,34,"(661.7792358398438, 850.8797200520834)",0.815701,Right_brow
1,201,2021_07_26,3,1,35,"(665.3018493652344, 845.0055338541666)",0.805013,Right_brow
2,201,2021_07_26,3,1,36,"(670.5080973307291, 840.4264831542969)",0.834134,Right_brow
3,201,2021_07_26,3,1,37,"(676.5781046549479, 837.4498291015625)",0.844086,Right_brow
4,201,2021_07_26,3,1,38,"(682.5829874674479, 835.72900390625)",0.809305,Right_brow


In [35]:
# Save the cleaned segments as a CSV file (without the index column) to be
# processed by "Data_Inspection_Emotion_Analysis".
blocks.to_csv('keypoints_clean_PANDA3_2_dataset.csv', index=False)
print("Dataset saved successfully as keypoints_clean_PANDA3_2_dataset.csv")

Dataset saved successfully as keypoints_clean_PANDA3_2_dataset.csv


: 

In [None]:
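# Old code below: an earlier version of the pipeline that smooths and filters the keypoints after loading (its last saved output is keypoints_clean_PANDA1_2_dataset.csv).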

In [9]:
def process_infant_data(group):
    """Smooth the keypoints of one group and return them as ``processed_keypoint``.

    The group is ordered by ``frame_id``; each ``keypoint`` tuple is split into
    x / y, and per ``keypoint_index`` a 3-sample rolling mean followed by a
    3-sample median filter is applied to each coordinate. The result keeps the
    input columns and adds ``processed_keypoint`` holding the smoothed (x, y)
    tuple; all helper columns are removed again.
    """
    # Work on a frame-ordered copy of the group
    ordered = group.sort_values('frame_id')

    # Split the (x, y) tuples into two numeric helper columns
    ordered[['keypoint_x', 'keypoint_y']] = pd.DataFrame(ordered['keypoint'].tolist(), index=ordered.index)

    helper_columns = ['keypoint_x', 'keypoint_y']
    for raw_col, mean_col, median_col in (
        ('keypoint_x', 'mean_keypoint_x', 'median_keypoint_x'),
        ('keypoint_y', 'mean_keypoint_y', 'median_keypoint_y'),
    ):
        # Mean filter frame-to-frame, separately for every keypoint index
        rolling_mean = ordered.groupby('keypoint_index')[raw_col].rolling(window=3, min_periods=1).mean()
        ordered[mean_col] = rolling_mean.reset_index(level=0, drop=True)

        # Median filter on top of the mean-filtered values
        ordered[median_col] = ordered.groupby('keypoint_index')[mean_col].transform(
            lambda values: median_filter(values, size=3)
        )
        helper_columns += [mean_col, median_col]

    # Recombine the smoothed coordinates into one tuple per row
    ordered['processed_keypoint'] = list(zip(ordered['median_keypoint_x'], ordered['median_keypoint_y']))

    # Hand back the frame without the intermediate columns
    return ordered.drop(columns=helper_columns)

# Apply process_infant_data to every (infant_id, date, cam) group separately.
# group_keys=False keeps the group keys out of the result index, and the index
# is then renumbered from 0.
df_processed = df.groupby(['infant_id', 'date', 'cam'], group_keys=False).apply(process_infant_data).reset_index(drop=True)


In [21]:
# Print a label and display the first 80 rows of df.
print("Original DataFrame:")
df.head(80)

Original DataFrame:


Unnamed: 0,infant_id,date,cam,frame_id,keypoint,keypoint_score,face_part,keypoint_index
0,3,2021_07_21,3,0,"(1027.427734375, 229.10992431640625)",0.751885,Right_brow,34
1,3,2021_07_21,3,0,"(1039.9215087890625, 220.3180389404297)",0.762259,Right_brow,35
2,3,2021_07_21,3,0,"(1055.6544189453125, 218.92984008789062)",0.720471,Right_brow,36
3,3,2021_07_21,3,0,"(1070.9244384765625, 221.24349975585938)",0.673808,Right_brow,37
4,3,2021_07_21,3,0,"(1086.6573486328125, 224.94534301757812)",0.599189,Right_brow,38
...,...,...,...,...,...,...,...,...
75,3,2021_07_21,3,1,"(1154.517822265625, 219.02101135253906)",0.793231,Left_brow,49
76,3,2021_07_21,3,1,"(1140.180419921875, 221.79598999023438)",0.744120,Left_brow,50
77,3,2021_07_21,3,1,"(1125.8431396484375, 225.49594116210938)",0.734699,Left_brow,51
78,3,2021_07_21,3,1,"(1104.568359375, 237.98330688476562)",0.817636,Nose,52


In [22]:
print("\nProcessed DataFrame:")
# The raw 'keypoint' column is no longer needed; 'processed_keypoint' holds the smoothed values.
df_processed =  df_processed.drop(columns="keypoint")
# reset_index must not be called with inplace=True here: the in-place variant
# returns None, which would be assigned to df_processed and make the .head()
# call below fail. The previous index is kept as an 'index' column.
df_processed = df_processed.sort_values(by = ["infant_id","date","cam","frame_id", 'keypoint_index']).reset_index()
df_processed.head(80)


Processed DataFrame:


Unnamed: 0,index,infant_id,date,cam,frame_id,keypoint_score,face_part,keypoint_index,processed_keypoint
0,0,3,2021_07_21,3,0,0.751885,Right_brow,34,"(1027.427734375, 229.10992431640625)"
1,46,3,2021_07_21,3,0,0.762259,Right_brow,35,"(1039.9215087890625, 220.3180389404297)"
2,33,3,2021_07_21,3,0,0.720471,Right_brow,36,"(1055.6544189453125, 218.92984008789062)"
3,34,3,2021_07_21,3,0,0.673808,Right_brow,37,"(1070.9244384765625, 221.24349975585938)"
4,35,3,2021_07_21,3,0,0.599189,Right_brow,38,"(1086.6573486328125, 224.94534301757812)"
...,...,...,...,...,...,...,...,...,...
75,106,3,2021_07_21,3,1,0.793231,Left_brow,49,"(1154.1356201171875, 219.206787109375)"
76,118,3,2021_07_21,3,1,0.744120,Left_brow,50,"(1139.7945556640625, 221.98247528076172)"
77,117,3,2021_07_21,3,1,0.734699,Left_brow,51,"(1125.2222290039062, 225.91474151611328)"
78,116,3,2021_07_21,3,1,0.817636,Nose,52,"(1104.40478515625, 237.90182495117188)"


In [18]:
## Keep only frames whose average keypoint confidence is above 0.8
## NOTE(review): nothing here checks that every keypoint is present in a frame;
## only the mean score per (infant_id, date, cam, frame_id) is tested.

grouped = df_processed .groupby(['infant_id', 'date','cam','frame_id']).agg(average_confidence=('keypoint_score', 'mean')).reset_index()
filtered = grouped[grouped['average_confidence'] > 0.8]

# inner-merge back with the per-keypoint rows so that only rows of the
# retained frames remain (this also adds the 'average_confidence' column)

filtered_frames = df_processed.merge(filtered, on=['infant_id', 'date','cam','frame_id'], how='inner')
filtered_frames = filtered_frames.sort_values(by=['infant_id', 'date', 'cam','frame_id'])
filtered_frames = filtered_frames.reset_index(drop = True)
print(filtered_frames)

            index  infant_id        date  cam  frame_id  keypoint_score  \
0               0          3  2021_07_21    3         0        0.751885   
1              46          3  2021_07_21    3         0        0.762259   
2              33          3  2021_07_21    3         0        0.720471   
3              34          3  2021_07_21    3         0        0.673808   
4              35          3  2021_07_21    3         0        0.599189   
...           ...        ...         ...  ...       ...             ...   
8509915  25711582         86  2024_03_20    3      7487        0.825212   
8509916  25711583         86  2024_03_20    3      7487        0.749815   
8509917  25711584         86  2024_03_20    3      7487        0.826427   
8509918  25711585         86  2024_03_20    3      7487        0.821910   
8509919  25711586         86  2024_03_20    3      7487        0.852507   

          face_part  keypoint_index                        processed_keypoint  \
0        Right_bro

In [19]:
## Identify continuous segments that are at least 10 frames long (roughly 1/3 second) 
filtered_frames = filtered_frames.sort_values(by=['infant_id', 'date', 'cam', 'keypoint_index', 'frame_id'])
# identify continuous segments for each 'infant_id' and 'keypoint_index'

# Gap to the previous frame within the same recording/keypoint; the first row of
# each group has no predecessor and is filled with 1 (treated as contiguous).
filtered_frames['frame_diff'] = filtered_frames.groupby(['infant_id','date', 'cam', 'keypoint_index'])['frame_id'].diff().fillna(1)
# Running counter that increases whenever the gap is not exactly one frame.
filtered_frames['block'] = (filtered_frames['frame_diff'] != 1).cumsum()
# Keep only the segments with at least 10 rows.
blocks = filtered_frames.groupby(['infant_id','date','cam','keypoint_index', 'block']).filter(lambda x: len(x) >= 10)
# Drop the helper columns and expose the smoothed coordinates under the name 'keypoint'.
blocks = blocks.drop(columns=['block', 'frame_diff', 'average_confidence'])
blocks = blocks.rename(columns={"processed_keypoint":"keypoint"})
# Fix the output column order, then order rows by recording and frame number.
blocks = blocks[['infant_id', 'date', 'cam', 'frame_id', 'keypoint_index', 'keypoint', 'keypoint_score', 'face_part']]
blocks = blocks.sort_values(by=['infant_id','date','cam','frame_id']).reset_index(drop = True)
blocks.head()

Unnamed: 0,infant_id,date,cam,frame_id,keypoint_index,keypoint,keypoint_score,face_part
0,3,2021_07_21,3,0,34,"(1027.427734375, 229.10992431640625)",0.751885,Right_brow
1,3,2021_07_21,3,0,35,"(1039.9215087890625, 220.3180389404297)",0.762259,Right_brow
2,3,2021_07_21,3,0,36,"(1055.6544189453125, 218.92984008789062)",0.720471,Right_brow
3,3,2021_07_21,3,0,37,"(1070.9244384765625, 221.24349975585938)",0.673808,Right_brow
4,3,2021_07_21,3,0,38,"(1086.6573486328125, 224.94534301757812)",0.599189,Right_brow


In [20]:
# Print the (truncated) text representation of the cleaned segments.
print(blocks)

         infant_id        date  cam  frame_id  keypoint_index  \
0                3  2021_07_21    3         0              34   
1                3  2021_07_21    3         0              35   
2                3  2021_07_21    3         0              36   
3                3  2021_07_21    3         0              37   
4                3  2021_07_21    3         0              38   
...            ...         ...  ...       ...             ...   
6242455         83  2024_04_10    3      6887             100   
6242456         83  2024_04_10    3      6887             101   
6242457         83  2024_04_10    3      6887             102   
6242458         83  2024_04_10    3      6887             103   
6242459         83  2024_04_10    3      6887             104   

                                         keypoint  keypoint_score   face_part  
0            (1027.427734375, 229.10992431640625)        0.751885  Right_brow  
1         (1039.9215087890625, 220.3180389404297)        0.

In [23]:
# Save the cleaned segments as a CSV file (without the index column) to be
# processed by "Data_Inspection_Emotion_Analysis".
blocks.to_csv('keypoints_clean_PANDA1_2_dataset.csv', index=False)
print("Dataset saved successfully as keypoints_clean_PANDA1_2_dataset.csv")

Dataset saved successfully as keypoints_clean_PANDA1_2_dataset.csv
