Use SocialGNN_encoding_2 conda environment

To facilitate graph creation andnhelp standardize graph size, we only kept clips with at least 2 people and a maximum of 5 entities (people + objects)

 node features were obtained by passing pixel information within that entity’s bounding box through a pretrained VGG19 network30. The output from the penultimate fully connected layer was reduced to 20 dimensions via PCA and this feature vector was appended with the 4D coordinates of the bounding box (representing the location and size of the entity) and a boolean variable denoting whether it was an agent (person) or an object. 

In [1]:
import os
import numpy as np
import pandas as pd
import cv2
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
n_components = 20

In [3]:
video_input_path = '../Data/dyad_videos_3000ms'
annotation_input_path = '../Data/preprocess/annotations.csv'
behavioral_ratings_path = '../Data/behavioral_ratings.csv'
patches_output_path = '../Data/preprocess/video_data/'
pca_dir = "../Data/preprocess/fitted_PCA"

In [4]:
ratings_of_interest =['spatial expanse', 'object directed','interagent distance', 'agents facing', 'communication',  'joint action', 'valence', 'arousal']

In [5]:
def load_pickle(path):
    with open(path, 'rb') as f:
        pickled = pickle.load(f)
    return pickled

In [6]:
def save_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

## CROP OUT IMAGES PATCHES FROM VIDEOS

In [None]:
annotations = pd.read_csv(annotation_input_path)
annotations

In [None]:
behavioral_ratings = pd.read_csv(behavioral_ratings_path)
behavioral_ratings[behavioral_ratings['video_name']=='-YwZOeyAQC8_15.mp4'][ratings_of_interest].values.tolist()[0]

In [None]:
# Iterate over each unique video in the DataFrame
for video_name in tqdm(annotations['video_name'].unique()):
    # print("Processing video:", video_name)
    # Get ratings from this video
    video_ratings = behavioral_ratings[behavioral_ratings['video_name']==video_name][ratings_of_interest].values.tolist()[0]

    # Fetch all frames annotations in this video
    save_path = os.path.join(patches_output_path, video_name)
    os.makedirs(save_path, exist_ok=True)
    video = cv2.VideoCapture(os.path.join(video_input_path, video_name))
    video_annotations = annotations[annotations['video_name'] == video_name]
    patches = []
    annotations_dict = {'labels': [], 'gazes': [], 'frame_numbers': [], 
                   'left': [], 'right': [], 'top': [], 'bottom': []}
    
    for current_frame in range(1, 91):
        successful_read, frame = video.read()
        if successful_read:
            # Filter annotations for the current frame
            frame_annotations = video_annotations[video_annotations['frame'] == current_frame]
            if not frame_annotations.empty:
                for _, entity in frame_annotations.iterrows():
                    patches.append(frame[int(entity['top']):int(entity['bottom']),int(entity['left']):int(entity['right'])])
                    annotations_dict['labels'].append(entity['label_name'])
                    annotations_dict['gazes'].append(entity['gaze_direction'])   
                    annotations_dict['frame_numbers'].append(current_frame)
                    annotations_dict['left'].append(int(entity['left']))
                    annotations_dict['right'].append(int(entity['right']))
                    annotations_dict['top'].append(int(entity['top']))
                    annotations_dict['bottom'].append(int(entity['bottom']))
                    
        else:
            raise ValueError(f"Unsuccessful read frame {current_frame} of {video_name}")
    save_pickle(patches, os.path.join(save_path, 'patches'))
    annotation_df = pd.DataFrame(annotations_dict)
    save_pickle(annotation_df, os.path.join(save_path, 'annotations'))
    save_pickle(video_ratings, os.path.join(save_path, 'ratings'))


In [None]:
def visualize_patches(num_patches_to_display=10):
    # Get a list of all subdirectories in the patches_output_path directory
    videos = [d for d in os.listdir(patches_output_path)]
    import random
    # Randomly select one of the subdirectories
    selected_subdir = random.choice(videos)
    selected_path = os.path.join(patches_output_path, selected_subdir)
    
    patches = load_pickle(os.path.join(selected_path, 'patches'))
    annot = load_pickle(os.path.join(selected_path, 'annotations'))
    
    # Display each patch with its corresponding labels
    print(len(patches))
    for i, patch in enumerate(patches):
        print(annot.loc[i, 'frame_numbers'])
        print(annot.loc[i, 'labels'])
        print(annot.loc[i, 'gazes'])
        plt.imshow(cv2.cvtColor(patch, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.show()
        if i+2> num_patches_to_display:
            break


In [None]:
visualize_patches(5)

## VGG FEATURES

In [None]:
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.preprocessing.image import smart_resize
from tensorflow.keras.models import Model

In [None]:
def reshape_patches(x):
    temp = np.expand_dims(x, axis=0)
    temp2 = preprocess_input(smart_resize(temp, (224,224)))
    return temp2[0]

In [None]:
base_model = VGG19(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

In [None]:
restart = input('Do you want to reprocess the input videos? (y/n)')

In [None]:
for video in tqdm(os.listdir(patches_output_path)):
    video_dir = os.path.join(patches_output_path, video)
    patch_dir = os.path.join(video_dir, "patches")
    out_dir = os.path.join(video_dir, "VGG19_patches")
    if not os.path.exists(out_dir) or restart == 'y':
        patches = load_pickle(patch_dir)
        reshaped_patches  = [reshape_patches(patch) for patch in patches]
        x = np.array(reshaped_patches)
        y = model.predict(x)
        save_pickle(y, out_dir)
        # print(f"VGG19 patches saved to {video_dir}")


## fit PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
def extract_all_vggfeatures():
    all_features = []
    for video in tqdm(os.listdir(patches_output_path)):
        patch_dir = os.path.join(patches_output_path, video, "VGG19_patches") 
        all_features.extend(load_pickle(patch_dir))
        
    all_features = np.array(all_features)
    print(all_features.shape)
    return all_features

In [None]:
def fit_pca(all_features):
    pca = PCA(n_components=20)
    scaler = StandardScaler()
    all_features_scaled = scaler.fit_transform(all_features)
    pca.fit(all_features_scaled)
    return pca, scaler

In [None]:
pca, scaler = fit_pca(extract_all_vggfeatures())
save_pickle(pca, os.path.join(pca_dir, f"{n_components}pca"))
save_pickle(scaler, os.path.join(pca_dir, f"{n_components}scaler"))

In [None]:
# # Assuming your VGG feature matrix is called vgg_features (shape: [n_samples, n_features])
# vgg_features = extract_all_vggfeatures()
# 
# # Apply PCA
# pca = PCA()
# pca.fit(vgg_features)
# 
# # Get the explained variance ratio for each principal component
# explained_variance_ratio = pca.explained_variance_ratio_
# 
# # Calculate the cumulative explained variance
# cumulative_variance_explained = np.cumsum(explained_variance_ratio)
# 
# # Plot the cumulative explained variance
# plt.figure(figsize=(8, 6))
# plt.plot(cumulative_variance_explained, marker='o')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('Variance Explained by PCA Components')
# plt.grid(True)
# 
# from kneed import KneeLocator  # External library for detecting the elbow
# # Detect the elbow point
# kneedle = KneeLocator(range(1, len(cumulative_variance_explained) + 1), 
#                       cumulative_variance_explained, 
#                       curve='concave', 
#                       direction='increasing')
# 
# elbow_point = kneedle.elbow
# plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow at {elbow_point}')
# plt.legend()
# plt.show()
# 
# # Print the elbow point
# print(f'The elbow is located at component {elbow_point}')
# 
# 
# plt.show()


In [None]:
# # Find the number of dimensions that explain 90% to 95% variance
# min_dim_90 = np.argmax(cumulative_variance_explained >= 0.90) + 1  # Adding 1 to get the actual number of components
# max_dim_95 = np.argmax(cumulative_variance_explained >= 0.95) + 1  # Adding 1 to get the actual number of components
# min_dim_75 = np.argmax(cumulative_variance_explained >= 0.75) + 1
# # Print the results
# print(f'Number of dimensions that explain at least 90% variance: {min_dim_90}')
# print(f'Number of dimensions that explain at least 95% variance: {max_dim_95}')
# 
# print(f'Number of dimensions that explain at least 75% variance: {min_dim_75}')

## PCA on VGG features

In [None]:
pca = load_pickle(os.path.join(pca_dir, f"{n_components}pca"))
scaler = load_pickle(os.path.join(pca_dir, f'{n_components}scaler'))

In [None]:
for video in tqdm(os.listdir(patches_output_path)):
    video_dir = os.path.join(patches_output_path, video)
    patch_dir = os.path.join(video_dir, "VGG19_patches")
    vgg_features = load_pickle(patch_dir)
    scaled_features = scaler.transform(vgg_features)
    pca_features = pca.transform(scaled_features)
    save_pickle(pca_features, os.path.join(video_dir, f"{n_components}pca_features"))    

## Node Features


In [None]:
for video in tqdm(os.listdir(patches_output_path)):
    video_dir = os.path.join(patches_output_path, video)
    pca_features = load_pickle(os.path.join(video_dir, f"{n_components}pca_features"))
    video_annot = load_pickle(os.path.join(video_dir, "annotations"))
    new_features = []
    for i, patch_feature in enumerate(pca_features):
        new_feature = np.append(patch_feature, [video_annot['top'][i], video_annot['bottom'][i], video_annot['left'][i], video_annot['right'][i]])
        new_feature = np.append(new_feature, [0] if video_annot['labels'][i] in ['head1', 'head2'] else [1])
        assert len(new_feature) == n_components + 5
        new_features.append(new_feature)
    video_annot['features'] = new_features
    save_pickle(video_annot, os.path.join(video_dir, "annotations"))

In [None]:
video_annot

## Split into Sequences


In [11]:
def split_sequences(dir, df):
    splits = []
    start_index = 0
    
    # Get the unique frame numbers
    unique_frames = df['frame_numbers'].unique()
    
    # Create a pair for the first frame
    frame_labels = df[df['frame_numbers'] == unique_frames[0]]['labels'].tolist()
    
    # Copy the list using list slicing
    previous_pairs = frame_labels[:]
    print(previous_pairs)
    
    for i in range(1, len(unique_frames)):
        frame = unique_frames[i]
        # Create a pair for the current frame, handling NaNs        
        frame_labels = df[df['frame_numbers'] == frame]['labels'].tolist()
        current_pairs = frame_labels[:]
        
        # Check if there's any difference in pairs
        if current_pairs != previous_pairs:
            
            # Update the previous pairs
            # Sequence must have at least two people
            if 'head1' in previous_pairs and 'head2' in previous_pairs:
                splits.append(df[df['frame_numbers'].isin(unique_frames[start_index:i])])
            previous_pairs = current_pairs[:]
            print(previous_pairs, i)
            start_index = i

    # Append the last segment
    if 'head1' in previous_pairs and 'head2' in previous_pairs:
        splits.append(df[df['frame_numbers'].isin(unique_frames[start_index:])])
    # Save all the sequences
    for i, split in enumerate(splits):
        save_pickle(split, os.path.join(dir, f"sequence_{i}"))

In [12]:
for video in tqdm(os.listdir(patches_output_path)):
    video_dir = os.path.join(patches_output_path, video)
    video_annot = load_pickle(os.path.join(video_dir, "annotations"))
    split_sequences(video_dir, video_annot)
    break

  0%|          | 0/250 [00:00<?, ?it/s]

['head1', 'head2', 'object1']
['head1', 'object1'] 57
['head1', 'head2', 'object1'] 61
['head1', 'object1'] 69
['head1', 'head2', 'object1'] 73
['head1', 'object1'] 85
['head1', 'head2', 'object1'] 87





## Build Graph

In [7]:
def process_sequence(sequence_annotations):
    
    all_labels = ['head1', 'head2', 'object1', 'object2']
    sequence_labels = sequence_annotations['labels'].unique().tolist()
    # Process in the order of the all_labels list
    entities = []
    for entity in all_labels:
        if entity in sequence_labels:
            entities.append(entity)

    grouped = sequence_annotations.groupby('frame_numbers')
    graph_dicts_frames = []
    for frame_number, group in grouped:
        senders, receivers, nodes = [], [], []
        # print(entities)
        for i, entity in enumerate(entities):
            # Access the value in the 'features' and 'gazes' columns of that entity
            feature = group.loc[group['labels'] == entity, 'features'].iloc[0].tolist()

                
            nodes.append(feature)
            edge = group.loc[group['labels'] == entity, 'gazes'].iloc[0]
            # if gaze exists (only when the entity is person)
            if not isinstance(edge, float):
                sender, receiver = edge.split(', ')
                # if the gaze is at some entities not found in the video, the gaze will be discarded
                if sender == entity and receiver in entities and sender != receiver: 
                    senders.append(i)
                    receivers.append(entities.index(receiver))
                # else:
                #     print(f'unrecognised gaze {edge} in {entities}')
        while len(nodes) < 4:
            nodes.append([0 for _ in range(n_components+5)])
            
        assert len(nodes) == 4
        graph_dict = {'nodes': nodes, 'senders': senders, 'receivers': receivers}
        graph_dicts_frames.append(graph_dict)
    return graph_dicts_frames

In [8]:
all_videos = {}
for video_name in tqdm(sorted(os.listdir(patches_output_path))):
    # if video_name != '29.mp4':
    #     continue
    video_dir = os.path.join(patches_output_path, video_name)
    video_data = {'graph_dicts': [],
                  # 'sequences': [],
                  'labels': load_pickle(os.path.join(video_dir, "ratings"))}    
    num_seq = len([s for s in os.listdir(video_dir) if s.startswith('sequence_')])
    for pickle_idx in range(num_seq):
        pickle_name = f'sequence_{str(pickle_idx)}'
        
        sequence_annotations = load_pickle(os.path.join(video_dir, pickle_name))
        # if video_name == '144.mp4' and pickle_idx == 2:
        #     print(sequence_annotations['labels'])
        #     raise ValueError
        graph_dicts_sequence = process_sequence(sequence_annotations)
        video_data['graph_dicts'].append(graph_dicts_sequence)
    all_videos[video_name.replace('.mp4', '')] = video_data
    # if video_name == '29.mp4':
    #     print(graph_dicts_sequences[0])

100%|██████████| 250/250 [00:16<00:00, 15.25it/s]


In [9]:
save_pickle(all_videos, "../Data/preprocess/graphs")

## Input Structure

In [10]:
all_videos = load_pickle("../Data/preprocess/graphs")
structured_graph_dir = '../Data/preprocess/structured_graph'

In [11]:
structured_graph = []
for video_name in tqdm(all_videos.keys()):
    for seq_idx, seq in enumerate(all_videos[video_name]['graph_dicts']):
        seq_dict = {'label': all_videos[video_name]['labels'],
                    'graph_dicts': all_videos[video_name]['graph_dicts'][seq_idx]}
        structured_graph.append(seq_dict)
save_pickle(structured_graph, structured_graph_dir)

100%|██████████| 250/250 [00:00<?, ?it/s]


## Bootstrapping

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# all_videos = load_pickle("../Data/preprocess/graphs")
# bootstrapping_dir = '../Data/preprocess/bootstrapped'
# bootstrapping = 10

In [None]:
# myKeys = list(all_videos.keys())
# myKeys.sort()
# all_videos.keys()

In [None]:
# for date in ['27Sep']:
#     ran_state = 13 if date =='27Sep' else 27
#     for i in range(bootstrapping):
#         all_sequences = []
#         seq_train_idx = []
#         seq_test_idx = []
#         V_train_idx, V_test_idx = train_test_split(list(all_videos.keys()), random_state=ran_state+i)
#         print("Train Videos", len(V_train_idx), "Test Videos", len(V_test_idx))
#         for video_idx in V_train_idx:
#             ## return a list of tuple, triple, ... each of them is a sequence event label
#             ## For example: [('SingleGaze', 'SingleGaze'), ('GazeFollow', 'GazeFollow')]
#             for seq_idx, seq in enumerate(all_videos[video_idx]['graph_dicts']):
#                 seq_dict = {'label': all_videos[video_idx]['labels'],
#                             'graph_dicts': all_videos[str(video_idx)]['graph_dicts'][seq_idx]}
#                 seq_train_idx.append(len(all_sequences))
#                 all_sequences.append(seq_dict)
#                     # if len(all_sequences) == 312+1 and i == 0:
#                     #     print(len(seq_labels))
#                     #     print(video_idx, seq_idx)
#                     #     print(all_videos[str(video_idx)]['graph_dicts'][seq_idx][0])
#                     
#         for video_idx in V_test_idx:
#             for seq_idx, seq in enumerate(all_videos[video_idx]['graph_dicts']):
#                 seq_dict = {'label': seq,
#                             'graph_dicts': all_videos[video_idx]['graph_dicts'][seq_idx]}
#                 seq_test_idx.append(len(all_sequences))
#                 all_sequences.append(seq_dict)
#                     
#         print("Train Seqs", len(seq_train_idx), "Test Seqs", len(seq_test_idx))
#         output_dir = os.path.join(bootstrapping_dir, f"{date}_{i}.pkl")
#         with open(output_dir, 'wb') as f:
#             pickle.dump(all_sequences, f)
#             pickle.dump(seq_train_idx, f)
#             pickle.dump(seq_test_idx, f)

dictionary structure:
 video index(keys) --> sequences (index) --> frames (index) --> nodes & senders & receivers
 
    ## outer loops: video --> multiple sequences --> multiple frames
    ## Under one frame: feature numbers, senders and reveicers in all edges.
    print(len(V[key]['graph_dicts'][0][0]['nodes']), len(V[key]['graph_dicts'][0][0]['senders']), len(V[key]['graph_dicts'][0][0]['receivers']))
    ## number of frames in a sequence
    print(len(V[key]['graph_dicts'][0]))
    ## number of sequences in a video
    print(len(V[key]['graph_dicts']))
    
  
I guess let me try not to divide the sequences first
so I would have 

video index (keys) --> frames(index) --> nodes & senders & receivers

I also didn't do bootstrapping