# Initial configuration

In [None]:
# IPython-only magics (not valid plain Python): enable the autoreload extension.
# Per the IPython autoreload docs, mode 2 re-imports modules before running each
# cell, so edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Cache directory constant for this notebook.
CACHE_DIR = "/scratch/chaijy_root/chaijy0/sstorks/.cache/"

# EPIC KITCHENS

In [None]:
import pandas as pd
import sys
# Make the local VideoBLIP-internal checkout importable (path is relative to the
# notebook's working directory).
sys.path.append("./VideoBLIP-internal")

# NOTE(review): EpicKitchensDataset is not referenced in the cells visible here —
# confirm whether a later cell uses it.
from video_blip.data.epic_kitchens import EpicKitchensDataset

# Which annotation split to load; it is substituted into both paths below.
PARTITION = "validation"
# PARTITION = "train"
EK_ANNOTATION_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS/annotations/EPIC_100_{PARTITION}_full_sent.csv"
# One row per narrated action clip; the columns read downstream are narration_id,
# full_sent_narration, start_timestamp, stop_timestamp, verb, noun and all_nouns.
annotations = pd.read_csv(EK_ANNOTATION_PATH)

# Load visual annotations
# Only the paths are defined here; in the visible cells they are referenced solely
# from commented-out code.
NOUN_CLASSES_PATH = "/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS-VISOR/2v6cgv1x04ol22qp9rm9x2j6a7/EPIC_100_noun_classes_v2.csv"
EK_VISUAL_ANNOTATION_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS-VISOR/2v6cgv1x04ol22qp9rm9x2j6a7/GroundTruth-SparseAnnotations/annotations/{PARTITION}"

## 1 recipe, 2 video clips formulation

In [None]:
import ast
import json
import os
from collections import defaultdict
from datetime import datetime
from pprint import pprint

import numpy as np
from tqdm import tqdm

# Group the per-narration annotation rows by video:
#   structured_action_lists[video_id][clip_index] -> action record (dict)
structured_action_lists = defaultdict(dict)

# Format of the start/stop timestamp columns, e.g. "00:01:02.345".
TIMESTAMP_FORMAT = "%H:%M:%S.%f"

for row_idx, row in tqdm(annotations.iterrows(), total=len(annotations)):

    # narration_id has three "_"-separated parts: participant, video, clip.
    # The video id used as the grouping key is "<participant>_<video>".
    participant_index, video_index, clip_index = row['narration_id'].split('_')
    video_index = participant_index + "_" + video_index
    clip_index = int(clip_index)

    narration_text = row['full_sent_narration']

    # Duration of the action in seconds, from the parsed start/stop timestamps.
    start_timestamp = datetime.strptime(row['start_timestamp'], TIMESTAMP_FORMAT)
    stop_timestamp = datetime.strptime(row['stop_timestamp'], TIMESTAMP_FORMAT)
    action_seconds = (stop_timestamp - start_timestamp).total_seconds()

    verb = row['verb']
    noun = row['noun']  # TODO: "all_nouns" key gives access to other noun participants where applicable

    structured_action_lists[video_index][clip_index] = {
        "video_id": video_index,
        "clip_index": clip_index,
        "verb": verb,
        "noun": noun,
        # The column holds a Python list literal serialized as text. Parse it with
        # literal_eval rather than eval so a CSV cell can never execute code.
        "all_nouns": ast.literal_eval(row["all_nouns"]),
        "narration_text": narration_text,
        "action_seconds": action_seconds,
        # Keep the raw timestamp strings (not the parsed datetimes) in the record.
        "start_timestamp": row['start_timestamp'],
        "stop_timestamp": row['stop_timestamp'],
    }

print(f"{len(structured_action_lists)} videos collected from EPIC KITCHENS")

# Flatten each video's {clip_index: action} dict into a list of actions ordered by
# clip index. Gaps in the clip numbering are simply skipped. Only values of existing
# keys are reassigned, so mutating the dict while iterating over it is safe.
for video_id in structured_action_lists:
    clips_by_index = structured_action_lists[video_id]
    structured_action_lists[video_id] = [
        clips_by_index[clip_id] for clip_id in sorted(clips_by_index)
    ]

N_ACTIONS_PER_VIDEO = 5
N_SECONDS_PER_CHUNK = 60
# Don't consider some actions as the key actions, e.g., look for, which is not so
# much a physical action affecting objects
NON_KEY_VERBS = frozenset(['look-for', 'eat', 'drink'])


def _select_key_actions(clips, n_actions, excluded_verbs):
    """Pick the longest distinct actions from `clips`.

    Clips are considered from longest to shortest `action_seconds`. A clip is
    skipped when its verb is in `excluded_verbs` or when a clip with the same
    (verb, noun) pair was already selected, so the longest instance of each
    pair is the one kept.

    Args:
        clips: list of action records (dicts with 'verb', 'noun',
            'action_seconds' and 'clip_index' keys).
        n_actions: maximum number of actions to select.
        excluded_verbs: collection of verbs that may never be selected.

    Returns:
        At most `n_actions` records, ordered by ascending 'clip_index'. Fewer
        are returned when `clips` does not contain enough eligible actions.
    """
    selected = []
    seen_pairs = set()
    # sorted() is stable: among equally long duplicates the earliest clip wins.
    for clip in sorted(clips, key=lambda c: c['action_seconds'], reverse=True):
        if len(selected) >= n_actions:
            break
        pair = (clip['verb'], clip['noun'])
        if clip['verb'] in excluded_verbs or pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        selected.append(clip)
    return sorted(selected, key=lambda c: c['clip_index'])


travel_examples = []
for video_id in structured_action_lists:

    # Running state for the chunk currently being accumulated in this video.
    time_so_far = 0.0
    clips_so_far = []
    pairs_so_far = set()  # unique (verb, noun) pairs seen in this chunk
    for clip in structured_action_lists[video_id]:

        time_so_far += clip['action_seconds']
        clips_so_far.append(clip)
        pairs_so_far.add((clip['verb'], clip['noun']))

        # Create a TRAVEl example for every 1min of video in EK
        if time_so_far < N_SECONDS_PER_CHUNK or len(pairs_so_far) < N_ACTIONS_PER_VIDEO * 2:
            continue

        # Sample the 5 longest actions as the key actions for this video
        key_actions = _select_key_actions(clips_so_far, N_ACTIONS_PER_VIDEO, NON_KEY_VERBS)

        # The threshold above also counts excluded verbs, so filtering can leave
        # too few key actions. Keep extending the chunk in that case instead of
        # aborting the whole run on a failed assertion.
        if len(key_actions) < N_ACTIONS_PER_VIDEO:
            continue

        clip_indices = [c['clip_index'] for c in clips_so_far]
        example = {
            "video_id": video_id,
            "key_clips": key_actions,
            "start_clip_index": min(clip_indices),
            "end_clip_index": max(clip_indices),
            "all_clip_indices": clip_indices,
            "n_clips": len(clips_so_far),
            "total_minutes": sum([c['action_seconds'] for c in clips_so_far]) / 60.0,
            # First sentence of each *key* action's narration, re-terminated with "."
            "all_narrations": [action['narration_text'].split('.')[0].strip() + "." for action in key_actions],
        }

        # Compile annotated objects seen during this example
        all_objects = {noun for action in clips_so_far for noun in action['all_nouns']}
        # for action in key_actions:
        #     vis_ann_path = os.path.join(EK_VISUAL_ANNOTATION_PATH, f"{video_id}.json")
        #     if os.path.exists(vis_ann_path):
        #         vis_annotations = json.load(open(vis_ann_path, "r"))
        #         for ann in vis_annotations['annotations']:
        #             all_objects.append(ann["name"])
        #     else:
        #         print(f"Warning: Couldn't find visual annotations for video {video_id} clip {action['clip_index']}!")
        # Sort so the list is reproducible: set iteration order over strings
        # changes between interpreter runs.
        example["scene_objects"] = sorted(all_objects)

        travel_examples.append(example)

        # Start accumulating the next chunk of this video.
        clips_so_far = []
        pairs_so_far = set()
        time_so_far = 0.0

# Summary statistics over the generated examples, then a full dump of them.
clips_per_example = [ex['n_clips'] for ex in travel_examples]
minutes_per_example = [ex['total_minutes'] for ex in travel_examples]
average_clips = np.mean(clips_per_example)
average_time = np.mean(minutes_per_example)

summary_lines = [
    f"{len(travel_examples)} TRAVEl examples created",
    f"Average # action clip per examples: {average_clips}",
    f"Average time per example (minutes): {average_time}",
    # mean minutes / mean clips == total minutes / total clips; * 60 gives seconds
    f"Average time of an action (seconds): {average_time / average_clips * 60.0}",
]
print("\n".join(summary_lines))
pprint(travel_examples)

# TODO: collect list of objects in clips from visual annotations? Seems VISOR data is incomplete - need to refer to Peter's code for epic kitchens
# -- Should actually do this and maybe restrict data to only VISOR-annotated data - may want these masks later for our approach? Can also use them for grounding in Kosmos-2?
# -- Maybe do this instead of over-sampling?
# TODO: find a way to make examples include fewer clips? Or filter away less important actions?
# -- We can cluster very quick actions together.
# TODO: some actions next to each other are very similar or hypo/hypernyms of each other, e.g., "wash glass" -> "rinse glass". We should avoid this or cluster. 
# -- Maybe consecutive actions that exceed some threshold similarity should be clustered?
# -- "WayOf" relations in ConceptNet can inform what actions should be clustered/ignored.