In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
!unzip /content/drive/MyDrive/data/behavior_results.zip
!unzip /content/drive/MyDrive/data/reference_behavior_results.zip

In [4]:
import pandas as pd
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score
from itertools import combinations

sheet_id = "1sQEGYhR2XTuOIeDERvie_BJxmxRxsQMA88bDS6uPxlM"
sheet_name = "Summary"

csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

label = pd.read_csv(csv_url)
label = label[['meeting', 'participant','video_clip','DucManh', 'Charlotte', 'Ducanh', 'Money']]

# excluding 'Money'
label_cols_3 = ['DucManh', 'Charlotte', 'Ducanh']
def majority_vote(row):
    return row.value_counts().idxmax()

def vote_agreement_level(row):
    counts = row.value_counts(normalize=True)
    max_vote_ratio = counts.max()
    if max_vote_ratio == 1.0:
        return 'full'
    elif max_vote_ratio >= 2/3:
        return 'majority'
    else:
        return 'disagree'
label_df_3 = label[['meeting']+['participant']+['video_clip'] + label_cols_3].dropna()
label_df_3['segment_id'] = label_df_3.apply(lambda x: f"{x['meeting']}-{x['participant']}_{x['video_clip'].replace('.mp4','')}", axis=1)
label_df_3['agreement'] = label_df_3[label_cols_3].apply(
    lambda row: row.value_counts(normalize=True).max(), axis=1
)
label_df_3['majority_label_3'] = label_df_3[label_cols_3].apply(majority_vote, axis=1)
label_df_3['vote_agreement'] = label_df_3[label_cols_3].apply(vote_agreement_level, axis=1)
print(label_df_3.groupby('vote_agreement').size())
label_df_3 = label_df_3[label_df_3['vote_agreement'] != 'disagree'].copy()
label_df_3 = label_df_3[label_df_3['majority_label_3'] != 'unclear'].copy()
print(label_df_3.groupby('vote_agreement').size())
label_df_3



vote_agreement
disagree     4
full        47
majority    56
dtype: int64
vote_agreement
full        46
majority    56
dtype: int64


Unnamed: 0,meeting,participant,video_clip,DucManh,Charlotte,Ducanh,segment_id,agreement,majority_label_3,vote_agreement
0,20210323,SP01F,clip_732_737.mp4,Active engagement,Active engagement,Active engagement,20210323-SP01F_clip_732_737,1.000000,Active engagement,full
1,20210323,SP01F,clip_747_752.mp4,Active engagement,Active engagement,Active engagement,20210323-SP01F_clip_747_752,1.000000,Active engagement,full
2,20210323,SP01F,clip_837_842.mp4,Intermittent engagement,Active engagement,Active engagement,20210323-SP01F_clip_837_842,0.666667,Active engagement,majority
3,20210323,SP01F,clip_846_851.mp4,Active engagement,Active engagement,Intermittent engagement,20210323-SP01F_clip_846_851,0.666667,Active engagement,majority
4,20210323,SP01F,clip_849_854.mp4,Active engagement,Active engagement,Intermittent engagement,20210323-SP01F_clip_849_854,0.666667,Active engagement,majority
...,...,...,...,...,...,...,...,...,...,...
102,20220408,SP09M,clip_405_410.mp4,Intermittent engagement,Intermittent engagement,Intermittent engagement,20220408-SP09M_clip_405_410,1.000000,Intermittent engagement,full
103,20220408,SP09M,clip_585_590.mp4,Disengagement,Intermittent engagement,Disengagement,20220408-SP09M_clip_585_590,0.666667,Disengagement,majority
104,20220408,SP09M,clip_3165_3170.mp4,Intermittent engagement,Active engagement,Active engagement,20220408-SP09M_clip_3165_3170,0.666667,Active engagement,majority
105,20220408,SP09M,clip_3168_3173.mp4,Intermittent engagement,Active engagement,Intermittent engagement,20220408-SP09M_clip_3168_3173,0.666667,Intermittent engagement,majority


In [5]:

import pickle

# Read the pickle file
with open('/content/drive/MyDrive/data/dnn_feature.pickle', 'rb') as f:
    dnn_feature = pickle.load(f)

with open('/content/drive/MyDrive/data/dnn_feature_reference.pickle', 'rb') as f:
    dnn_feature_reference = pickle.load(f)


In [6]:
# prompt: read /content/drive/MyDrive/data/target_segmentation.csv

import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/data/target_segmentation.csv')
df.head()

Unnamed: 0,folder_path,meeting_id,person_id,file_name,start_time,end_time,full_file_path,reference_persons
0,/content/drive/MyDrive/filtered_clips/20210323...,20210323,SP07F,clip_378_383.mp4,378,383,/content/drive/MyDrive/filtered_clips/20210323...,"['SP01F', 'SP06M', 'SP03M']"
1,/content/drive/MyDrive/filtered_clips/20210323...,20210323,SP07F,clip_1113_1118.mp4,1113,1118,/content/drive/MyDrive/filtered_clips/20210323...,"['SP01F', 'SP03M', 'SP06M']"
2,/content/drive/MyDrive/filtered_clips/20210323...,20210323,SP07F,clip_1074_1079.mp4,1074,1079,/content/drive/MyDrive/filtered_clips/20210323...,"['SP06M', 'SP03M', 'SP01F']"
3,/content/drive/MyDrive/filtered_clips/20210323...,20210323,SP07F,clip_1182_1187.mp4,1182,1187,/content/drive/MyDrive/filtered_clips/20210323...,"['SP06M', 'SP01F', 'SP03M']"
4,/content/drive/MyDrive/filtered_clips/20210323...,20210323,SP07F,clip_255_260.mp4,255,260,/content/drive/MyDrive/filtered_clips/20210323...,"['SP03M', 'SP01F', 'SP06M']"


In [7]:
# prompt: concat all the csv
import os
import ast

def get_file_path(row):
  # Assuming the row has columns that can construct the file path
  # For example, if row has 'date', 'animal_id', 'clip_start', 'clip_end'
  date = row['meeting_id'] # Replace 'date' with the actual column name
  person_id = row['person_id'] # Replace 'animal_id' with the actual column name
  clip_start = row['start_time'] # Replace 'clip_start' with the actual column name
  clip_end = row['end_time'] # Replace 'clip_end' with the actual column name
  return f"/content/behavior_results/{date}-{person_id}/clip_{clip_start}_{clip_end}.csv"
c=0
def get_reference_file_path(row):
  global c
  # Assuming the row has columns that can construct the file path
  # For example, if row has 'date', 'animal_id', 'clip_start', 'clip_end'
  date = row['meeting_id'] # Replace 'date' with the actual column name
  clip_start = row['start_time'] # Replace 'clip_start' with the actual column name
  clip_end = row['end_time'] # Replace 'clip_end' with the actual column name
  file_paths = []
  for person_id in ast.literal_eval(row['reference_persons']):
    file_path = f"/content/reference_behavior_results/{date}-{person_id}/clip_{clip_start}_{clip_end}.csv"
    if os.path.exists(file_path):
      file_paths.append(file_path)
  if len(file_paths)<2:
    c+=1
    print("error lanking reference", len(file_paths))
  return file_paths

all_features_df = pd.DataFrame()
all_features_ref1 = pd.DataFrame()
all_features_ref2 = pd.DataFrame()

for index, row in df.iterrows():
  reference_file_paths = get_reference_file_path(row)
  csv_file_path = get_file_path(row)
  if os.path.exists(csv_file_path):
    try:
      features_df = pd.read_csv(csv_file_path)
      all_features_df = pd.concat([all_features_df, features_df], ignore_index=True)
    except Exception as e:
      print(f"An error occurred while reading {csv_file_path}: {e}")
  else:
    print(f"File not found: {csv_file_path}")
    # prompt: do the same with refence feature

  if len(reference_file_paths) > 0 and os.path.exists(reference_file_paths[0]):
    try:
      features_ref1 = pd.read_csv(reference_file_paths[0])
      all_features_ref1 = pd.concat([all_features_ref1, features_ref1], ignore_index=True)
    except Exception as e:
      print(f"An error occurred while reading {reference_file_paths[0]}: {e}")
  else:
    all_features_ref1 = pd.concat([all_features_ref1, pd.DataFrame([{}])], ignore_index=True)
    if len(reference_file_paths) > 0:
      print(f"File not found: {reference_file_paths[0]}")

  # Concatenate features from the second reference file
  if len(reference_file_paths) > 1 and os.path.exists(reference_file_paths[1]):
    try:
      features_ref2 = pd.read_csv(reference_file_paths[1])
      all_features_ref2 = pd.concat([all_features_ref2, features_ref2], ignore_index=True)
    except Exception as e:
      print(f"An error occurred while reading {reference_file_paths[1]}: {e}")
  else:
      all_features_ref2 = pd.concat([all_features_ref2, pd.DataFrame([{}])], ignore_index=True)
      if len(reference_file_paths) > 1:
        print(f"File not found: {reference_file_paths[1]}")

print("Concatenated Reference 1 DataFrame shape:", all_features_ref1.shape)
print("Concatenated Reference 2 DataFrame shape:", all_features_ref2.shape)
print("Concatenated DataFrame shape:", all_features_df.shape)
print("Lacking reference",c)

error lanking reference 1
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 0
error lanking reference 0
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 1
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 1
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lanking reference 0
error lanking reference 0
error lanking reference 1
error lanking reference 1
error lankin

In [8]:
all_features_ref1

Unnamed: 0,segment_id,duration,video_path,start_time,end_time,valid_frames,avg_pose,pose_std,segment,num_valid_transitions,...,AU45_r_duration,chin_rest_detected,chin_rest_frame_ratio,touching_face_detected,touching_face_frame_ratio,support_forehead_detected,support_forehead_frame_ratio,hand_raise_detected,hand_raise_frame_ratio,clip_name
0,20210323-SP01F_clip_378_383,5.0,/content/content/reference_clip/20210323-SP01F...,378.0,383.0,126.0,"[-23.982031966138386, 25.887002006773834, 179....","[2.641305967004011, 1.498288288585553, 2.20640...","(378, 383)",125.0,...,0.48,False,0.0,False,0.0,False,0.0,False,0.00,clip_378_383.mp4
1,20210323-SP01F_clip_1113_1118,5.0,/content/content/reference_clip/20210323-SP01F...,1113.0,1118.0,126.0,"[-42.03995220760016, 27.480209132219365, -166....","[4.204461775768183, 1.281689362635023, 0.88809...","(1113, 1118)",125.0,...,0.00,False,0.0,False,0.0,False,0.0,False,0.00,clip_1113_1118.mp4
2,20210323-SP06M_clip_1074_1079,5.0,/content/content/reference_clip/20210323-SP06M...,1074.0,1079.0,126.0,"[22.120582578287895, 22.3158813791212, 168.735...","[2.36528330220425, 1.3952873563511894, 1.29779...","(1074, 1079)",125.0,...,0.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_1074_1079.mp4
3,20210323-SP06M_clip_1182_1187,5.0,/content/content/reference_clip/20210323-SP06M...,1182.0,1187.0,126.0,"[17.31989267131847, 23.09387887171484, 171.671...","[2.7097604509110518, 1.1140912993786527, 0.916...","(1182, 1187)",125.0,...,0.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_1182_1187.mp4
4,20210323-SP03M_clip_255_260,5.0,/content/content/reference_clip/20210323-SP03M...,255.0,260.0,126.0,"[46.44568364985898, 36.5801491866292, 177.1966...","[5.77810521165361, 1.763964217049017, 1.924692...","(255, 260)",125.0,...,0.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_255_260.mp4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,20220916-SP03M_clip_170_175,5.0,/content/content/reference_clip/20220916-SP03M...,170.0,175.0,126.0,"[-1.5842548036780788, 25.785761665602312, -179...","[1.962343402222446, 1.2142468517563407, 0.7265...","(170, 175)",125.0,...,0.64,False,0.0,False,0.0,False,0.0,False,0.00,clip_170_175.mp4
627,20220916-SP03M_clip_0_5,5.0,/content/content/reference_clip/20220916-SP03M...,0.0,5.0,121.0,"[0.7987719690956768, 28.6503157046054, 178.010...","[3.417462827981237, 2.049161058339137, 1.23232...","(0, 5)",120.0,...,1.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_0_5.mp4
628,20220916-SP05F_clip_235_240,5.0,/content/content/reference_clip/20220916-SP05F...,235.0,240.0,126.0,"[-1.2291309610521781, 22.143107851925922, 177....","[4.428691081492357, 3.0609770270401313, 4.4655...","(235, 240)",125.0,...,1.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_235_240.mp4
629,20220916-SP04F_clip_5_10,5.0,/content/content/reference_clip/20220916-SP04F...,5.0,10.0,126.0,"[-8.625701743079695, 21.264191382523318, -173....","[3.162785592113953, 2.1922590669373294, 1.2912...","(5, 10)",125.0,...,1.04,False,0.0,False,0.0,False,0.0,False,0.00,clip_5_10.mp4


In [9]:
all_features_ref2

Unnamed: 0,segment_id,duration,video_path,start_time,end_time,valid_frames,avg_pose,pose_std,segment,num_valid_transitions,...,AU45_r_duration,chin_rest_detected,chin_rest_frame_ratio,touching_face_detected,touching_face_frame_ratio,support_forehead_detected,support_forehead_frame_ratio,hand_raise_detected,hand_raise_frame_ratio,clip_name
0,20210323-SP06M_clip_378_383,5.0,/content/content/reference_clip/20210323-SP06M...,378.0,383.0,126.0,"[-3.078479217421869, 24.666117899730004, 176.6...","[2.2089049213538967, 0.7731939310339792, 0.971...","(378, 383)",125.0,...,0.04,False,0.000,False,0.000,False,0.000,False,0.0,clip_378_383.mp4
1,20210323-SP03M_clip_1113_1118,5.0,/content/content/reference_clip/20210323-SP03M...,1113.0,1118.0,126.0,"[-1.0203919666780152, 26.307214856170766, 177....","[2.8746542796055254, 1.3223817662564963, 1.295...","(1113, 1118)",125.0,...,0.48,False,0.000,False,0.000,False,0.000,False,0.0,clip_1113_1118.mp4
2,20210323-SP03M_clip_1074_1079,5.0,/content/content/reference_clip/20210323-SP03M...,1074.0,1079.0,126.0,"[-3.270940913024445, 27.572546816724127, -178....","[2.339839554161804, 0.8085793034780608, 1.4436...","(1074, 1079)",125.0,...,0.48,False,0.000,False,0.000,False,0.000,False,0.0,clip_1074_1079.mp4
3,20210323-SP01F_clip_1182_1187,5.0,/content/content/reference_clip/20210323-SP01F...,1182.0,1187.0,126.0,"[6.043316631352067, 23.236435152206642, -177.3...","[12.349043116728838, 2.099725768279239, 3.2982...","(1182, 1187)",125.0,...,0.48,False,0.000,False,0.000,False,0.000,False,0.0,clip_1182_1187.mp4
4,20210323-SP01F_clip_255_260,5.0,/content/content/reference_clip/20210323-SP01F...,255.0,260.0,126.0,"[-18.488218431374484, 27.663513930408648, -179...","[12.448951310944969, 1.1542094243052037, 1.597...","(255, 260)",125.0,...,0.48,False,0.000,False,0.000,False,0.000,False,0.0,clip_255_260.mp4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,20220916-SP04F_clip_170_175,5.0,/content/content/reference_clip/20220916-SP04F...,170.0,175.0,126.0,"[1.1535208357646158, 24.580112330111366, 179.6...","[5.444830807901308, 1.5862895581756207, 2.9398...","(170, 175)",125.0,...,1.04,False,0.000,False,0.000,False,0.000,False,0.0,clip_170_175.mp4
627,20220916-SP05F_clip_0_5,5.0,/content/content/reference_clip/20220916-SP05F...,0.0,5.0,121.0,"[-7.012214102436831, 23.61696465074466, -172.8...","[4.96796108614936, 1.8939807112765286, 9.22887...","(0, 5)",120.0,...,1.04,False,0.000,True,0.132,False,0.000,False,0.0,clip_0_5.mp4
628,20220916-SP01F_clip_235_240,5.0,/content/content/reference_clip/20220916-SP01F...,235.0,240.0,126.0,"[-5.059829658388944, 24.764254376271253, 178.9...","[6.588772585971992, 2.547260650427872, 1.89463...","(235, 240)",125.0,...,1.04,False,0.000,False,0.000,False,0.000,False,0.0,clip_235_240.mp4
629,20220916-SP05F_clip_5_10,5.0,/content/content/reference_clip/20220916-SP05F...,5.0,10.0,123.0,"[-7.829189114015408, 25.294581677016886, -173....","[5.861869755891109, 2.6486196237760575, 4.1251...","(5, 10)",122.0,...,1.04,False,0.040,False,0.000,False,0.008,False,0.0,clip_5_10.mp4


In [13]:
# prompt: for each row extract feature using features list, for each feature convert categories to numerical onehot, boodlan to 0/1 value and concat all to a single vector

import pandas as pd
import numpy as np
import json
features = [
"valid_frames",
"avg_pose",
"pose_std",
"num_valid_transitions",
"avg_time_between_detections",
"avg_yaw_velocity",
"avg_pitch_velocity",
"avg_roll_velocity",
"max_yaw_velocity",
"max_pitch_velocity",
"max_roll_velocity",
"total_yaw_movement",
"total_pitch_movement",
"total_roll_movement",
"yaw_velocity_std",
"pitch_velocity_std",
"roll_velocity_std",
"movement_intensity",
"movement_classification",
"nod_count",
"nod_frequency",
"nod_type",
"fast_nod_count",
"slow_nod_count",
"shake_count",
"shake_frequency",
"shake_type",
"chin_rest_detected",
"chin_rest_frame_ratio",
"touching_face_detected",
"touching_face_frame_ratio",
"support_forehead_detected",
"support_forehead_frame_ratio",
"hand_raise_detected",
"hand_raise_frame_ratio",
]
def preprocess_manual_features(row, features):
    feature_vector = []
    for feature in features:
        print(feature, len(feature_vector))
        value = row.get(feature)
        if pd.isna(value):
            # print(f"Warning: Missing value for feature '{feature}' in row. Skipping.")
            return None

        if feature == 'movement_classification':
            # One-hot encode 'movement_classification'
            categories = ['High movement', 'Moderate movement', 'Low movement', 'Static'] # Define possible categories
            one_hot_vector = [1 if value == cat else 0 for cat in categories]
            feature_vector.extend(one_hot_vector)
        elif feature == 'avg_pose' or feature == 'pose_std':
            value = json.loads(value)
            feature_vector.extend(value)
        elif feature == 'nod_type' or feature == 'shake_type':
             # One-hot encode 'nod_type' and 'shake_type'
            categories = ['none', 'occasional', 'frequent'] # Define possible categories
            one_hot_vector = [1 if value == cat else 0 for cat in categories]
            feature_vector.extend(one_hot_vector)
        elif isinstance(value, bool):
            # Convert boolean to 0/1
            feature_vector.append(int(value))
        elif isinstance(value, np.ndarray):
            # Extend with elements of numpy array (like dnn_feature)
            feature_vector.extend(value.tolist())
        elif isinstance(value, (int, float)):
            # Append numerical values
            feature_vector.append(value)
        else:
            # Handle other types or raise an error
            print(f"Warning: Unexpected data type for feature '{feature}': {type(value)}. Skipping.")
    return np.array(feature_vector)




In [14]:
# prompt: # prompt: process feature that start with AU just concant all float value

import numpy as np
def process_au_features(row):
    """
    Processes features that start with 'AU' for a given row.
    Concatenates all float values from these features into a single numpy array.
    """
    au_features = []
    for col in row.index:
        if col.startswith('AU'):
            value = row[col]
            # Check if the value is a list or numpy array of floats
            if isinstance(value, (list, np.ndarray)):
                # Ensure all elements are floats before extending
                if all(isinstance(x, (int, float)) for x in value):
                    au_features.extend(value)
            elif isinstance(value, (int, float)):
                 au_features.append(value)


    return np.array(au_features)



In [15]:
# Create a new column 'feature_vector' by applying the function to each row
all_features_df['manual_feature'] = all_features_df.apply(
    lambda row: preprocess_manual_features(row, features), axis=1
)

# Apply the function to create the 'AU_feature' column
all_features_df['AU_feature'] = all_features_df.apply(process_au_features, axis=1)
all_features_df['dnn_feature'] = all_features_df['video_path'].apply(
    lambda x: dnn_feature.get(x.replace('/content/filtered_clips/', ''), None)
)
print("Checking for None in 'dnn_feature' column:")
print(all_features_df['dnn_feature'].isnull().any())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
avg_roll_velocity 11
max_yaw_velocity 12
max_pitch_velocity 13
max_roll_velocity 14
total_yaw_movement 15
total_pitch_movement 16
total_roll_movement 17
yaw_velocity_std 18
pitch_velocity_std 19
roll_velocity_std 20
movement_intensity 21
movement_classification 22
nod_count 26
nod_frequency 27
nod_type 28
fast_nod_count 31
slow_nod_count 32
shake_count 33
shake_frequency 34
shake_type 35
chin_rest_detected 38
chin_rest_frame_ratio 39
touching_face_detected 40
touching_face_frame_ratio 41
support_forehead_detected 42
support_forehead_frame_ratio 43
hand_raise_detected 44
hand_raise_frame_ratio 45
valid_frames 0
avg_pose 1
pose_std 4
num_valid_transitions 7
avg_time_between_detections 8
avg_yaw_velocity 9
avg_pitch_velocity 10
avg_roll_velocity 11
max_yaw_velocity 12
max_pitch_velocity 13
max_roll_velocity 14
total_yaw_movement 15
total_pitch_movement 16
total_roll_movement 17
yaw_velocity_std 18
pitch_velocity_std 19
roll_

In [None]:
# prompt: apply with refence features

# Create a new column 'feature_vector' by applying the function to each row for ref1
all_features_ref1['manual_feature'] = all_features_ref1.apply(
    lambda row: preprocess_manual_features(row, features), axis=1
)

# Apply the function to create the 'AU_feature' column for ref1
all_features_ref1['AU_feature'] = all_features_ref1.apply(process_au_features, axis=1)

# Create a new column 'feature_vector' by applying the function to each row for ref2
all_features_ref2['manual_feature'] = all_features_ref2.apply(
    lambda row: preprocess_manual_features(row, features), axis=1
)

# Apply the function to create the 'AU_feature' column for ref2
all_features_ref2['AU_feature'] = all_features_ref2.apply(process_au_features, axis=1)



In [None]:
all_features_ref1[['manual_feature','AU_feature']]

Unnamed: 0,manual_feature,AU_feature
0,"[126.0, -23.982031966138386, 25.88700200677383...","[0.2984126984126984, 0.581175928886822, 3.0, 3..."
1,"[126.0, -42.03995220760016, 27.480209132219365...","[0.053015873015873, 0.0807590969862327, 0.0, -..."
2,"[126.0, 22.120582578287895, 22.3158813791212, ...","[0.1348412698412698, 0.2224328541451883, 6.0, ..."
3,"[126.0, 17.31989267131847, 23.09387887171484, ...","[0.1348412698412698, 0.2224328541451883, 6.0, ..."
4,"[126.0, 46.44568364985898, 36.5801491866292, 1...","[0.088015873015873, 0.1128752928945557, 0.0, -..."
...,...,...
626,"[126.0, -1.5842548036780788, 25.78576166560231...","[0.0781746031746031, 0.105666651644884, 0.0, -..."
627,"[121.0, 0.7987719690956768, 28.6503157046054, ...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."
628,"[126.0, -1.2291309610521781, 22.14310785192592...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."
629,"[126.0, -8.625701743079695, 21.264191382523318...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."


In [None]:
all_features_ref2[['manual_feature','AU_feature']]


Unnamed: 0,manual_feature,AU_feature
0,"[126.0, -3.078479217421869, 24.666117899730004...","[0.1348412698412698, 0.2224328541451883, 6.0, ..."
1,"[126.0, -1.0203919666780152, 26.30721485617076...","[0.2984126984126984, 0.581175928886822, 3.0, 3..."
2,"[126.0, -3.270940913024445, 27.572546816724127...","[0.2984126984126984, 0.581175928886822, 3.0, 3..."
3,"[126.0, 6.043316631352067, 23.236435152206642,...","[0.2984126984126984, 0.581175928886822, 3.0, 3..."
4,"[126.0, -18.488218431374484, 27.66351393040864...","[0.2984126984126984, 0.581175928886822, 3.0, 3..."
...,...,...
626,"[126.0, 1.1535208357646158, 24.580112330111366...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."
627,"[121.0, -7.012214102436831, 23.61696465074466,...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."
628,"[126.0, -5.059829658388944, 24.764254376271253...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."
629,"[123.0, -7.829189114015408, 25.294581677016886...","[0.0775396825396825, 0.1268806463283444, 1.0, ..."


In [None]:
ref1 = []
for i in all_features_ref1['video_path']:
  dnn = None
  if isinstance(i,str):
    path = i.replace("/content/content/reference_clip/","")
    if path not in dnn_feature_reference:
      print(path)
      break
    else:
      dnn = dnn_feature_reference[path]
  ref1.append(dnn)
all_features_ref1['dnn_feature'] = ref1


In [None]:
ref2 = []
for i in all_features_ref2['video_path']:
  dnn = None
  if isinstance(i,str):
    path = i.replace("/content/content/reference_clip/","")
    if path not in dnn_feature_reference:
      print(path)
      break
    else:
      dnn = dnn_feature_reference[path]
  ref2.append(dnn)
all_features_ref2['dnn_feature'] = ref2


In [None]:
all_features_df = all_features_df[['segment_id','AU_feature', 'dnn_feature', 'manual_feature']]
all_features_ref1 = all_features_ref1[['AU_feature', 'dnn_feature', 'manual_feature']]
all_features_ref2 = all_features_ref2[['AU_feature', 'dnn_feature', 'manual_feature']]


In [None]:
all_features_ref1

Unnamed: 0,AU_feature,dnn_feature,manual_feature
0,"[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.052595716, 0.06267054, -0.123, -0.41390204,...","[126.0, -23.982031966138386, 25.88700200677383..."
1,"[0.053015873015873, 0.0807590969862327, 0.0, -...","[0.07015944, 0.020164885, -0.05986896, -0.2490...","[126.0, -42.03995220760016, 27.480209132219365..."
2,"[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, 22.120582578287895, 22.3158813791212, ..."
3,"[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, 17.31989267131847, 23.09387887171484, ..."
4,"[0.088015873015873, 0.1128752928945557, 0.0, -...","[-0.018141264, -0.13290454, 0.06462509, 0.2290...","[126.0, 46.44568364985898, 36.5801491866292, 1..."
...,...,...,...
626,"[0.0781746031746031, 0.105666651644884, 0.0, -...","[-0.14228053, -0.19247176, -0.21233575, -0.017...","[126.0, -1.5842548036780788, 25.78576166560231..."
627,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[-0.15170558, -0.19269006, -0.2064976, -0.1806...","[121.0, 0.7987719690956768, 28.6503157046054, ..."
628,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[-0.044145454, -0.11295659, -0.00957068, -0.20...","[126.0, -1.2291309610521781, 22.14310785192592..."
629,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.0354932, 0.09398946, -0.4960579, -0.425001,...","[126.0, -8.625701743079695, 21.264191382523318..."


In [None]:
all_features_ref2

Unnamed: 0,AU_feature,dnn_feature,manual_feature
0,"[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, -3.078479217421869, 24.666117899730004..."
1,"[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.027060911, -0.08274244, 0.0400747, 0.069514...","[126.0, -1.0203919666780152, 26.30721485617076..."
2,"[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.037613317, -0.005479859, 0.007825111, -0.00...","[126.0, -3.270940913024445, 27.572546816724127..."
3,"[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.15345049, 0.123038046, -0.33340588, -0.4496...","[126.0, 6.043316631352067, 23.236435152206642,..."
4,"[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.102719806, 0.10402108, -0.085270084, -0.421...","[126.0, -18.488218431374484, 27.66351393040864..."
...,...,...,...
626,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.09140542, 0.11850408, -0.22142988, -0.53438...","[126.0, 1.1535208357646158, 24.580112330111366..."
627,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.054450084, -0.08387012, -0.12646388, -0.310...","[121.0, -7.012214102436831, 23.61696465074466,..."
628,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.066675745, 0.008914113, -0.15461351, -0.419...","[126.0, -5.059829658388944, 24.764254376271253..."
629,"[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.14313833, 0.03576659, 0.03736056, -0.309561...","[123.0, -7.829189114015408, 25.294581677016886..."


In [None]:

import pandas as pd
# Rename columns for clarity after slicing
all_features_ref1 = all_features_ref1.rename(columns={
    'AU_feature': 'AU_feature_ref1',
    'dnn_feature': 'dnn_feature_ref1',
    'manual_feature': 'manual_feature_ref1'
})

all_features_ref2 = all_features_ref2.rename(columns={
    'AU_feature': 'AU_feature_ref2',
    'dnn_feature': 'dnn_feature_ref2',
    'manual_feature': 'manual_feature_ref2'
})
all_features_combined = pd.concat([all_features_df, all_features_ref1, all_features_ref2], axis=1)

all_features_combined


Unnamed: 0,segment_id,AU_feature,dnn_feature,manual_feature,AU_feature_ref1,dnn_feature_ref1,manual_feature_ref1,AU_feature_ref2,dnn_feature_ref2,manual_feature_ref2
0,20210323-SP07F_clip_378_383,"[0.1441269841269841, 0.3001846627428381, 2.0, ...","[0.034613874, -0.22478016, 0.07269942, -0.3664...","[126.0, 12.609585210300457, 28.444309351049924...","[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.052595716, 0.06267054, -0.123, -0.41390204,...","[126.0, -23.982031966138386, 25.88700200677383...","[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, -3.078479217421869, 24.666117899730004..."
1,20210323-SP07F_clip_1113_1118,"[0.1441269841269841, 0.3001846627428381, 2.0, ...","[0.061545894, -0.25375453, -0.04545928, -0.193...","[125.0, 8.967601040954058, 26.538118969393903,...","[0.053015873015873, 0.0807590969862327, 0.0, -...","[0.07015944, 0.020164885, -0.05986896, -0.2490...","[126.0, -42.03995220760016, 27.480209132219365...","[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.027060911, -0.08274244, 0.0400747, 0.069514...","[126.0, -1.0203919666780152, 26.30721485617076..."
2,20210323-SP07F_clip_1074_1079,"[0.1441269841269841, 0.3001846627428381, 2.0, ...","[-0.009148686, -0.12560014, -0.08919125, -0.21...","[121.0, 16.190503685472198, 26.270694432031604...","[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, 22.120582578287895, 22.3158813791212, ...","[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.037613317, -0.005479859, 0.007825111, -0.00...","[126.0, -3.270940913024445, 27.572546816724127..."
3,20210323-SP07F_clip_1182_1187,"[0.1441269841269841, 0.3001846627428381, 2.0, ...","[-0.041140895, -0.088186085, 0.0039451574, -0....","[89.0, 4.982063863359799, 30.799108675192933, ...","[0.1348412698412698, 0.2224328541451883, 6.0, ...","[-0.299179, -0.36308083, -0.21096529, 0.192069...","[126.0, 17.31989267131847, 23.09387887171484, ...","[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.15345049, 0.123038046, -0.33340588, -0.4496...","[126.0, 6.043316631352067, 23.236435152206642,..."
4,20210323-SP07F_clip_255_260,"[0.1441269841269841, 0.3001846627428381, 2.0, ...","[-0.017098123, -0.22169487, 0.061171174, -0.33...","[122.0, 12.994270436686323, 28.811799580626968...","[0.088015873015873, 0.1128752928945557, 0.0, -...","[-0.018141264, -0.13290454, 0.06462509, 0.2290...","[126.0, 46.44568364985898, 36.5801491866292, 1...","[0.2984126984126984, 0.581175928886822, 3.0, 3...","[0.102719806, 0.10402108, -0.085270084, -0.421...","[126.0, -18.488218431374484, 27.66351393040864..."
...,...,...,...,...,...,...,...,...,...,...
626,20220916-SP07F_clip_170_175,"[0.2068253968253968, 0.4341200770176864, 4.0, ...","[-0.08298848, -0.21334194, -0.23492509, -0.169...","[99.0, 3.015383967411516, 32.090275432264924, ...","[0.0781746031746031, 0.105666651644884, 0.0, -...","[-0.14228053, -0.19247176, -0.21233575, -0.017...","[126.0, -1.5842548036780788, 25.78576166560231...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.09140542, 0.11850408, -0.22142988, -0.53438...","[126.0, 1.1535208357646158, 24.580112330111366..."
627,20220916-SP07F_clip_0_5,"[0.2068253968253968, 0.4341200770176864, 4.0, ...","[-0.0563802, -0.22218841, -0.24426143, -0.1588...","[119.0, 12.12459926234018, 30.185529291105983,...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[-0.15170558, -0.19269006, -0.2064976, -0.1806...","[121.0, 0.7987719690956768, 28.6503157046054, ...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.054450084, -0.08387012, -0.12646388, -0.310...","[121.0, -7.012214102436831, 23.61696465074466,..."
628,20220916-SP07F_clip_235_240,"[0.2068253968253968, 0.4341200770176864, 4.0, ...","[-0.10170282, -0.21903856, -0.11246258, -0.036...","[125.0, 17.00454538866136, 31.420981086640406,...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[-0.044145454, -0.11295659, -0.00957068, -0.20...","[126.0, -1.2291309610521781, 22.14310785192592...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.066675745, 0.008914113, -0.15461351, -0.419...","[126.0, -5.059829658388944, 24.764254376271253..."
629,20220916-SP07F_clip_5_10,"[0.2068253968253968, 0.4341200770176864, 4.0, ...","[0.03087373, -0.23887897, -0.102511175, -0.139...","[126.0, 20.894380496320373, 31.906573152657277...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.0354932, 0.09398946, -0.4960579, -0.425001,...","[126.0, -8.625701743079695, 21.264191382523318...","[0.0775396825396825, 0.1268806463283444, 1.0, ...","[0.14313833, 0.03576659, 0.03736056, -0.309561...","[123.0, -7.829189114015408, 25.294581677016886..."


In [None]:
# prompt: remove row contain non in all_features_combined

all_features_combined.dropna(inplace=True)

In [None]:

with open('/content/drive/MyDrive/data/all_features_combined.pickle', 'wb') as f:
    pickle.dump(all_features_combined, f)
with open('/content/drive/MyDrive/data/label_df_3.pickle', 'wb') as f:
    pickle.dump(label_df_3, f)

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt



In [None]:
# --- 1. Load your data ---

with open('/content/drive/MyDrive/data/all_features_combined.pickle', 'rb') as f:
    all_features_combined_df = pickle.load(f)

with open('/content/drive/MyDrive/data/label_df_3.pickle', 'rb') as f:
    label_df_3 = pickle.load(f)
print(label_df_3['majority_label_3'].value_counts())

majority_label_3
Active engagement          50
Intermittent engagement    32
Disengagement              20
Name: count, dtype: int64
