In [1]:
#necessary imports here
import pandas as pd
import os
from itertools import combinations

In [2]:

# Label directories, one per fold (label_0402_fold_0 ... label_0402_fold_4)
folds = [f"SEMPI_public/data/engagement/label_0402_fold_{i}" for i in range(5)]

# Every fold directory holds a train.csv and a val.csv; collect both splits
# of every fold, keeping the order fold0/train, fold0/val, fold1/train, ...
all_dfs = []
for fold in folds:
    train_df, val_df = (
        pd.read_csv(f"{fold}/{split}.csv") for split in ("train", "val")
    )
    all_dfs += [train_df, val_df]

# One long table with a fresh 0..n-1 index
scores_df = pd.concat(all_dfs, ignore_index=True)

# Copy of the table ordered by video path; `scores_df` itself stays unsorted
sorted_df = scores_df.sort_values("video_path")

# Report the size of the combined (unsorted) table
print("Final Combined Data Shape:", scores_df.shape)


Final Combined Data Shape: (15665, 2)


In [3]:
# Preview the combined label table (plain-text repr; pandas abbreviates the
# middle rows and long strings of a large frame when it is printed)
print(scores_df)

                                              video_path    engagement
0      Te_Awamutu_Community_Board_Meeting_14_Septembe... -1.333333e-01
1      Group_therapy_video_2/clip0009/Group_therapy_v... -1.734723e-17
2      Early_Mid_StageCare_Support_Group_Webinar_10th... -6.666667e-02
3      Dry_Eye_Zoom_Group_October_14_2022/clip0001/Dr... -3.333333e-02
4      zoom_group_therapy_session_1/clip0026/zoom_gro... -2.500000e-01
...                                                  ...           ...
15660  PTSD_Buddies_Zoom_Group_Support_Meeting/clip00...  2.500000e-01
15661  Stroke_Buddies_Support_Group_Meeting_#5/clip00... -1.000000e-01
15662  PTSD_Buddies_Zoom_Group_Support_Meeting/clip00... -2.166667e-01
15663  Stroke_Buddies_Support_Group_Meeting_#5/clip00... -2.333333e-01
15664  PTSD_Buddies_Zoom_Group_Support_Meeting/clip00...  1.500000e-01

[15665 rows x 2 columns]


In [4]:
# Raw annotation export plus its companion table of video ids; together they
# are used to find out whether the person in a clip is a speaker
annotations_df = pd.read_csv("raw_annotations.csv")
ids_df = pd.read_csv("raw_annotations_video_ids.csv")

# Only the speaking answer and the join key are needed from the annotations
annotations_df = annotations_df.loc[:, ["Speaking Question", "Vid_num"]]

In [5]:
# Keep a single row per Vid_num on both sides (only the speaker answer matters,
# so the first row for each video is enough)
annotations_unique = annotations_df.drop_duplicates(subset="Vid_num")
ids_unique = ids_df.drop_duplicates(subset="Vid_num")

merged_df = (
    annotations_unique
    .merge(ids_unique, on="Vid_num", how="inner")
    # the join key and the unnamed leftover column are no longer needed
    .drop(columns=["Unnamed: 0", "Vid_num"])
    # reduce the link to its last path component and swap the extension so it
    # can be compared with feature file names
    .assign(**{
        "Video Link": lambda frame: (
            frame["Video Link"]
            .str.split("/").str[-1]
            .str.replace(r'mp4$', 'csv', regex=True)
        )
    })
    # the first row is not relevant to the data
    .drop(index=0)
    .reset_index(drop=True)
    # "No" -> 0, "Yes" -> 1; any other answer becomes NaN
    .assign(**{
        "Speaking Question": lambda frame: frame["Speaking Question"].map({"No": 0, "Yes": 1})
    })
)

print(merged_df)

# file name -> speaker flag ("Video Link" is already a bare file name here, so
# no further path splitting is required)
speaker_lookup = dict(zip(merged_df["Video Link"], merged_df["Speaking Question"]))

      Speaking Question                                         Video Link
0                   0.0  Te_Awamutu_Community_Board_Meeting_14_Septembe...
1                   0.0                   Group_therapy_video_27_clip3.csv
2                   0.0  Early_Mid_StageCare_Support_Group_Webinar_10th...
3                   0.0  Stroke_Buddies_Support_Group_Meeting_#5101_cli...
4                   0.0                   August_Facebook_Live47_clip7.csv
...                 ...                                                ...
8357                0.0                       Zoom_Focus_Group74_clip6.csv
8358                0.0     Dry_Eye_Zoom_Group_October_14_202249_clip4.csv
8359                0.0                   August_Facebook_Live51_clip8.csv
8360                0.0    Dry_Eye_Zoom_Group_October_14_2022101_clip1.csv
8361                0.0  Early_Mid_StageCare_Support_Group_Webinar_10th...

[8362 rows x 2 columns]


In [8]:
# Build speaker / non-speaker pair frames from the per-person feature CSVs.
#
# Directory layout walked below: <base_path>/<main_folder>/<clip_folder>/<file>.csv
# Inputs from earlier cells: `sorted_df` (video_path, engagement) and
# `speaker_lookup` (file name -> 1 / 0 / NaN).
# Result: `all_merged_dataframes[main_folder][clip_folder]` is a list with one
# combined DataFrame per (speaker, non-speaker) pair found in that clip folder.

# Base directory containing all folders
base_path = "SEMPI_public/data/engagement/featopenface/"

# Extract only the filename from `video_path` and append `.csv`
sorted_df["filename"] = sorted_df["video_path"].apply(lambda x: x.split("/")[-1] + ".csv")

# Convert extracted filenames to a set for quick lookup
video_paths_set = set(sorted_df["filename"])

# filename -> engagement, built once instead of scanning `sorted_df` with a
# boolean mask for every file. A filename can occur several times in the label
# table; the first occurrence wins, which is the row `.values[0]` used to pick.
_first_label_rows = sorted_df.drop_duplicates(subset="filename", keep="first")
engagement_lookup = dict(zip(_first_label_rows["filename"], _first_label_rows["engagement"]))


def _merge_pair(speaker_df, listener_df):
    """Place a speaker frame and a non-speaker frame side by side.

    Column order of the result: speaker features, ``speaker_1``,
    ``engagement_1``, non-speaker features, ``speaker_2``, ``engagement_2``.

    NOTE(review): the feature columns keep their original names, so every
    column present in both inputs appears twice in the result. Renaming them
    would change the schema of the CSV written later, so it is left as is.
    Rows are aligned on a fresh 0..n-1 index; if the two inputs differ in
    length, the shorter side is padded with NaN by ``pd.concat``.
    """
    parts = []
    for position, frame in enumerate((speaker_df, listener_df), start=1):
        frame = frame.reset_index(drop=True)
        parts.extend([
            frame.drop(columns=["is_speaker", "engagement"]),
            frame["is_speaker"].rename(f"speaker_{position}"),
            frame["engagement"].rename(f"engagement_{position}"),
        ])
    return pd.concat(parts, axis=1)


# Dictionary to store merged DataFrames for all folders
all_merged_dataframes = {}

# Loop through each subfolder inside `featopenface/`
for main_folder in os.listdir(base_path):
    main_folder_path = os.path.join(base_path, main_folder)

    # Ensure it's a directory (skip files)
    if not os.path.isdir(main_folder_path):
        continue

    print(f"Processing folder: {main_folder}")  # Show progress

    # Dictionary to store merged DataFrames per clip folder
    merged_dataframes = {}

    # Loop through all clip folders inside the main folder
    for folder in os.listdir(main_folder_path):
        folder_path = os.path.join(main_folder_path, folder)

        # Ensure it's a directory
        if not os.path.isdir(folder_path):
            continue

        # (file name, DataFrame) for every labelled file in this clip folder,
        # plus the speaker flag of each file so it does not have to be read
        # back out of the DataFrame later
        valid_named_dfs = []
        speaker_flags = {}

        for file in os.listdir(folder_path):
            # Only files that have an engagement label are of interest
            if file not in video_paths_set:
                continue

            file_path = os.path.join(folder_path, file)

            # Best effort: an unreadable file is reported and skipped
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Could not read {file}: {e}")
                continue

            # A file without rows carries no features and previously crashed
            # the speaker split below (`.iloc[0]` on an empty column)
            if df.empty:
                print(f"Skipping {file}: file has no rows")
                continue

            engagement_value = engagement_lookup[file]
            speaker_val = speaker_lookup.get(file, None)

            # Attach the labels as constant columns
            df["is_speaker"] = speaker_val
            df["engagement"] = engagement_value

            valid_named_dfs.append((file, df))
            speaker_flags[file] = speaker_val

        # Skip if there are not at least 2 valid DataFrames
        if len(valid_named_dfs) < 2:
            continue

        # Split into speakers and non-speakers. Files whose flag is missing
        # (None / NaN) compare unequal to both 1 and 0 and land in neither list.
        speakers = [(name, df) for (name, df) in valid_named_dfs if speaker_flags[name] == 1]
        non_speakers = [(name, df) for (name, df) in valid_named_dfs if speaker_flags[name] == 0]

        # Pair each speaker with each non-speaker. The two lists are disjoint
        # and their frames differ in `is_speaker`, so no same-file / identical
        # frame check is needed.
        merged_pairs = [
            _merge_pair(speaker_df, listener_df)
            for (_, speaker_df) in speakers
            for (_, listener_df) in non_speakers
        ]

        # Store merged DataFrames for this clip folder (may be an empty list)
        merged_dataframes[folder] = merged_pairs

    # Store merged results for the current main folder
    all_merged_dataframes[main_folder] = merged_dataframes

# Display an example merged DataFrame: the first non-empty pair list wins.
# The for/else + break combination leaves both loops as soon as one is shown.
for main_folder, clips in all_merged_dataframes.items():
    for clip_folder, merged_list in clips.items():
        if merged_list:  # Check if list is non-empty
            first_pair = merged_list[0]
            print(f"Showing first merged pair from {main_folder}/{clip_folder}")
            display(first_pair)
            break
    else:
        continue
    break

# Total number of merged pairs across all main folders
total_pairs = sum(len(pairs) for clips in all_merged_dataframes.values() for pairs in clips.values())


Processing folder: Ask_a_Therapist_How_to_Manage_Mental_Health_During_a_Pandemic
Processing folder: August_Facebook_Live
Processing folder: Dry_Eye_Zoom_Group_October_14_2022
Processing folder: Early_Mid_StageCare_Support_Group_Webinar_10th_September_2021
Processing folder: grief_support_group
Processing folder: Group_therapy_video_2
Processing folder: Mock_Group_Therapy_Session_Substance_Abuse
Processing folder: PCA_Support_Group_Webina-9th_December_2022
Processing folder: PCA_Support_Group_Webinar_-_1st_December_2023
Processing folder: PTSD_Buddies_Zoom_Group_Support_Meeting
Processing folder: Stroke_Buddies_Support_Group_Meeting_#5
Processing folder: Te_Awamutu_Community_Board_Meeting_14_September_2021
Processing folder: Zoom_Focus_Group
Processing folder: zoom_group_therapy_session_1
Showing first merged pair from Ask_a_Therapist_How_to_Manage_Mental_Health_During_a_Pandemic/clip0011


Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,speaker_2,engagement_2
0,1,0,0.0,0.98,1,0.173508,-0.017093,-0.984684,-0.00931,-0.04282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
1,2,0,0.0,0.98,1,0.17085,-0.038442,-0.984547,-0.04859,-0.036842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
2,3,0,0.0,0.98,1,0.123056,-0.008911,-0.99236,-0.116079,-0.020089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
3,4,0,0.0,0.98,1,0.152552,0.159316,-0.97537,-0.033447,0.082646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
4,5,0,0.0,0.98,1,0.152123,0.146761,-0.977405,-0.032364,0.087412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
5,6,0,0.0,0.98,1,0.123163,-0.004165,-0.992378,-0.011261,0.036783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
6,7,0,0.0,0.98,1,0.262196,0.242372,-0.934082,0.008739,0.170361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
7,8,0,0.0,0.98,1,0.22909,0.175307,-0.957489,-0.039784,0.152037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
8,9,0,0.0,0.98,1,0.157399,-0.044654,-0.986525,-0.000693,-0.014296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.133333
9,10,0,0.0,0.98,1,0.200674,-0.031161,-0.979162,-0.045729,-0.02241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.133333


In [10]:
# Flatten every merged pair frame into one table and write it to disk.

# Single source of truth for the output path, so the file that is written and
# the file named in the success message cannot drift apart
OUTPUT_CSV = "all_merged_dataframes_withspeakerColumn.csv"

# List to store all DataFrames before concatenation
all_dfs_list = []

# Iterate through all stored merged DataFrames
for main_folder, clip_folders in all_merged_dataframes.items():
    for clip_folder, merged_dfs in clip_folders.items():
        for df in merged_dfs:
            # Add metadata columns for traceability. This writes into the
            # frames held by `all_merged_dataframes`; re-running the cell just
            # overwrites the same two columns with the same values.
            df["main_folder"] = main_folder
            df["clip_folder"] = clip_folder
            all_dfs_list.append(df)

# Concatenate all DataFrames into one
if all_dfs_list:
    final_df = pd.concat(all_dfs_list, ignore_index=True)

    # Save to CSV (no index column)
    final_df.to_csv(OUTPUT_CSV, index=False)

    print(f"Successfully saved {len(final_df)} rows to '{OUTPUT_CSV}'.")
else:
    print("No DataFrames to save.")


Successfully saved 40404 rows to 'all_merged_dataframes.csv'.
