In [34]:
import numpy as np
import pandas as pd

In [35]:
EXCEL_FILE = "/orcd/data/satra/002/datasets/SAILS/data4analysis/Video Rating Data/SAILS_RATINGS_ALL_8.8.25.xlsx"

df = pd.read_excel(EXCEL_FILE)
print("Excel shape:", df.shape)
print(df.columns)

Excel shape: (3902, 42)
Index(['Coder', 'SourceFile', 'ID', 'FileName', 'Vid_duration', 'DOB',
       'Vid_date', 'Age_in_months', 'time_point', 'Context', 'Location',
       'Activity', 'Child_of_interest_clear', '#_adults', '#_children',
       '#_people_background', 'Interaction_with_child', '#_people_interacting',
       'Child_constrained', 'Constraint_type', 'Supports', 'Support_type',
       'Example_support_type', 'Gestures', 'Gesture_type', 'Vocalizations',
       'RMM', 'RMM_type', 'Response_to_name', 'Locomotion', 'Locomotion_type',
       'Grasping', 'Grasp_type', 'Body_Parts_Visible', 'Angle_of_Body',
       'Video_Quality_Child_Face_Visibility',
       'Video_Quality_Child_Body_Visibility',
       'Video_Quality_Child_Hand_Visibility', 'Video_Quality_Lighting',
       'Video_Quality_Resolution', 'Video_Quality_Motion', 'Notes'],
      dtype='object')


In [36]:
json_to_excel = {
    "Location":"Location",
    "context":"Context",
    "child_clear": "Child_of_interest_clear",
    "gestures": "Gestures",
    "gesture_type": "Gesture_type",
    "vocalizations": "Vocalizations",
    "rmm": "RMM",
    "rmm_type": "RMM_type",
    "locomotion": "Locomotion",
    "locomotion_type": "Locomotion_type",
    "grasping": "Grasping",
    "grasp_type": "Grasp_type",
    "lighting": "Video_Quality_Lighting",
    "resolution": "Video_Quality_Resolution",
    "motion": "Video_Quality_Motion",
    "Video_Quality_Child_Hand_Visibility": "Video_Quality_Child_Hand_Visibility",
    "video_quality_face": "Video_Quality_Child_Face_Visibility",
    "video_quality_body": "Video_Quality_Child_Body_Visibility",
    "num_adults": "#_adults",
    "num_children": "#_children", 
    "num_background": "#_people_background",
    "interaction_with_child": "Interaction_with_child",
    "num_interacting": "#_people_interacting",
    "child_constrained": "Child_constrained",
    "constraint_type": "Constraint_type",
    "supports": "Supports",
    "support_type": "Support_type",
    "body_parts_visible": "Body_Parts_Visible",
    "angle_of_body": "Angle_of_Body"
}

for json_key, excel_col in json_to_excel.items():
    if excel_col in df.columns:
        unique_vals = df[excel_col].dropna().unique()
        print(f"Unique values in '{excel_col}' (JSON key: '{json_key}'):")
        print(unique_vals)
        print("-" * 50)
    else:
        print(f"Column '{excel_col}' not found in Excel.")


Unique values in 'Location' (JSON key: 'Location'):
['inside private' 'inside public' 'outside public' 'outside private'
 'unknown' 'outside oublic' 'outside public ' 'outside private ']
--------------------------------------------------
Unique values in 'Context' (JSON key: 'context'):
['daily routine' 'toy play' 'general social communication interaction'
 'other ' 'book share' 'social routine' 'special occasion' 'motor play'
 'other' 'toy play ' 'general social interaction ']
--------------------------------------------------
Unique values in 'Child_of_interest_clear' (JSON key: 'child_clear'):
['yes' 'no' 'yes ']
--------------------------------------------------
Unique values in 'Gestures' (JSON key: 'gestures'):
['yes' 'no']
--------------------------------------------------
Unique values in 'Gesture_type' (JSON key: 'gesture_type'):
['reach' 'multiple' 'clap' 'take' 'point' 'other' 'head shake' 'show'
 'wave' 'give' 'mutiple' 'clapping' 'n/a ' 'head nod' 'reach ']
---------------

In [37]:


def clean_data(df):
    """Clean common data inconsistencies"""
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype(str).str.strip().str.lower()

    df['Location'] = df['Location'].replace({
        'outside oublic': 'outside public',
        'outside public ': 'outside public',
        'outside private ': 'outside private'
    })
    df['Context'] = df['Context'].replace({
        'other ': 'other',
        'toy play ': 'toy play',
        'general social interaction ': 'general social communication interaction'
    })
    df['Child_of_interest_clear'] = df['Child_of_interest_clear'].replace({
        'yes ': 'yes'
    })
    return df

def get_filename_list(df_subset):
    """Return filenames from a dataframe subset"""
    return df_subset['FileName'].tolist()

def create_video_groups(df):
    df = clean_data(df)
    groups = {}

    # Ensure numeric columns
    df['#_children'] = pd.to_numeric(df['#_children'], errors='coerce')
    df['#_adults'] = pd.to_numeric(df['#_adults'], errors='coerce')
    df['Video_Quality_Child_Face_Visibility'] = pd.to_numeric(df['Video_Quality_Child_Face_Visibility'], errors='coerce')
    df['Video_Quality_Child_Body_Visibility'] = pd.to_numeric(df['Video_Quality_Child_Body_Visibility'], errors='coerce')

    # 1. Face visibility
    groups['face_perfect'] = get_filename_list(df[df['Video_Quality_Child_Face_Visibility'] >= 9])
    groups['face_good'] = get_filename_list(df[(df['Video_Quality_Child_Face_Visibility'] >= 6) & 
                                               (df['Video_Quality_Child_Face_Visibility'] <= 8)])
    groups['face_medium'] = get_filename_list(df[(df['Video_Quality_Child_Face_Visibility'] >= 4) & 
                                                 (df['Video_Quality_Child_Face_Visibility'] < 6)])
    groups['face_poor'] = get_filename_list(df[df['Video_Quality_Child_Face_Visibility'] < 4])

    # 2. Body visibility
    groups['body_perfect'] = get_filename_list(df[df['Video_Quality_Child_Body_Visibility'] >= 9])
    groups['body_good'] = get_filename_list(df[(df['Video_Quality_Child_Body_Visibility'] >= 6) & 
                                               (df['Video_Quality_Child_Body_Visibility'] <= 8)])
    groups['body_poor'] = get_filename_list(df[df['Video_Quality_Child_Body_Visibility'] < 4])

    # 3. Face + body
    groups['face_body_perfect'] = get_filename_list(
        df[(df['Video_Quality_Child_Face_Visibility'] >= 9) &
           (df['Video_Quality_Child_Body_Visibility'] >= 9)]
    )
    groups['face_body_good'] = get_filename_list(
        df[(df['Video_Quality_Child_Face_Visibility'] >= 6) & 
           (df['Video_Quality_Child_Face_Visibility'] < 8) &
           (df['Video_Quality_Child_Body_Visibility'] >= 6) & 
           (df['Video_Quality_Child_Body_Visibility'] < 8)]
    )
    groups['face_body_poor'] = get_filename_list(   
        df[(df['Video_Quality_Child_Face_Visibility'] < 4) & 
           (df['Video_Quality_Child_Body_Visibility'] < 4)]
    )

    # 4. Interactions
    groups['single_child'] = get_filename_list(df[df['#_children'] == 1])
    groups['multiple_children'] = get_filename_list(df[df['#_children'] > 1])
    groups['no_adults'] = get_filename_list(df[df['#_adults'] == 0])
    groups['child_adult_interaction'] = get_filename_list(
        df[(df['#_adults'] >= 1) & 
           (df['#_children'] == 1) & 
           (df['Interaction_with_child'] == 'yes')]
    )

    # 5. Camera angle
    groups['front_view'] = get_filename_list(df[df['Angle_of_Body'] == 'front'])
    groups['side_profile'] = get_filename_list(df[df['Angle_of_Body'] == 'profile'])
    groups['variable_angle'] = get_filename_list(df[df['Angle_of_Body'] == 'variable'])

    # 6. Activity context
    groups['toy_play'] = get_filename_list(df[df['Context'] == 'toy play'])
    groups['daily_routine'] = get_filename_list(df[df['Context'] == 'daily routine'])
    groups['social_communication'] = get_filename_list(df[df['Context'] == 'general social communication interaction'])
    groups['motor_play'] = get_filename_list(df[df['Context'] == 'motor play'])
    groups['special_occasion'] = get_filename_list(df[df['Context'] == 'special occasion'])
    groups['book_share'] = get_filename_list(df[df['Context'] == 'book share'])

   # 7. Lighting
    groups['lighting_good'] = get_filename_list(df[df['Video_Quality_Lighting'] >= 8])
    groups['lighting_poor'] = get_filename_list(df[df['Video_Quality_Lighting'] <= 4])


    # 9. Environment (location)
    groups['inside_private'] = get_filename_list(df[df['Location'] == 'inside private'])
    groups['inside_public'] = get_filename_list(df[df['Location'] == 'inside public'])
    groups['outside_public'] = get_filename_list(df[df['Location'] == 'outside public'])
    groups['outside_private'] = get_filename_list(df[df['Location'] == 'outside private'])

    # 10. Constraint type
    groups['in_highchair'] = get_filename_list(df[df['Constraint_type'].isin(['highchair', 'chair'])])
    groups['on_lap'] = get_filename_list(df[df['Constraint_type'].isin(['lap', 'held'])])
    groups['free_movement'] = get_filename_list(df[df['Constraint_type'] == 'none'])

    # 11. Gestures / RMM / Locomotion
    groups['has_gesture'] = get_filename_list(df[df['Gesture_type'] != 'none'])
    groups['multiple_gestures'] = get_filename_list(df[df['Gesture_type'] == 'multiple'])
    groups['rmm_present'] = get_filename_list(df[df['RMM_type'] != 'none'])
    groups['locomotion_present'] = get_filename_list(df[df['Locomotion_type'] != 'none'])

    # 12. Composite groups
    groups['ideal_condition'] = get_filename_list(
        df[(df['Video_Quality_Child_Face_Visibility'] >= 9) &
           (df['Video_Quality_Child_Body_Visibility'] >= 9) &
           (df['Video_Quality_Lighting'] >= 8) &   
           (df['#_children'] == 1)]
    )

    groups['interaction_good_quality'] = get_filename_list(
        df[(df['Video_Quality_Child_Face_Visibility'] >= 8) &
           (df['#_adults'] >= 1) &
           (df['Interaction_with_child'] == 'yes')]
    )
    
    # 13. Resolution
    groups['resolution_high'] = get_filename_list(df[df['Video_Quality_Resolution'] >= 8])
    groups['resolution_low'] = get_filename_list(df[df['Video_Quality_Resolution'] <= 4])
    
    # 14. Motion
    groups['motion_good'] = get_filename_list(df[df['Video_Quality_Motion'] >= 8])
    groups['motion_poor'] = get_filename_list(df[df['Video_Quality_Motion'] <= 4])

    return groups


In [38]:
video_groups = create_video_groups(df)

for key, videos in video_groups.items():
    print(f"{key}: {len(videos)} videos")


face_perfect: 1128 videos
face_good: 1274 videos
face_medium: 694 videos
face_poor: 711 videos
body_perfect: 1535 videos
body_good: 1814 videos
body_poor: 163 videos
face_body_perfect: 804 videos
face_body_good: 273 videos
face_body_poor: 96 videos
single_child: 3014 videos
multiple_children: 774 videos
no_adults: 2851 videos
child_adult_interaction: 695 videos
front_view: 1055 videos
side_profile: 172 videos
variable_angle: 2448 videos
toy_play: 736 videos
daily_routine: 318 videos
social_communication: 1317 videos
motor_play: 623 videos
special_occasion: 169 videos
book_share: 88 videos
lighting_good: 3646 videos
lighting_poor: 58 videos
inside_private: 2839 videos
inside_public: 309 videos
outside_public: 385 videos
outside_private: 262 videos
in_highchair: 64 videos
on_lap: 51 videos
free_movement: 0 videos
has_gesture: 3902 videos
multiple_gestures: 242 videos
rmm_present: 3902 videos
locomotion_present: 3902 videos
ideal_condition: 677 videos
interaction_good_quality: 361 videos


In [None]:
video_groups = create_video_groups(df)
print("Perfect face videos:", video_groups['face_perfect'][0:5])  