# Initial configuration

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
CACHE_DIR = "/scratch/chaijy_root/chaijy0/sstorks/.cache/"

# CaptainCook4D

# EPIC KITCHENS

In [11]:
import pandas as pd

# PARTITION = "validation"
PARTITION = "train"
EK_ANNOTATION_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS/annotations/EPIC_100_{PARTITION}_full_sent.csv"
annotations = pd.read_csv(EK_ANNOTATION_PATH)
EK_55_VIDEO_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS/55"
EK_100_VIDEO_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS/100"


# # Load visual annotations
# NOUN_CLASSES_PATH = "/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS-VISOR/2v6cgv1x04ol22qp9rm9x2j6a7/EPIC_100_noun_classes_v2.csv"
# EK_VISUAL_ANNOTATION_PATH = f"/nfs/turbo/coe-chaijy/datasets/EPIC-KITCHENS-VISOR/2v6cgv1x04ol22qp9rm9x2j6a7/GroundTruth-SparseAnnotations/annotations/{PARTITION}"

## 1 recipe, 2 video clips formulation

Collect lists of clips for all videos in EPIC KITCHENS:

In [12]:
from pprint import pprint
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime
import json
import os
import numpy as np

structured_action_lists = defaultdict(dict)

for row_idx, row in tqdm(annotations.iterrows(), total=len(annotations)):
    
    participant_index, video_index, clip_index = row['narration_id'].split('_')
    video_index = participant_index + "_" + str(video_index)
    clip_index = int(clip_index)

    narration_text = row['full_sent_narration']
    
    start_timestamp = row['start_timestamp']
    start_timestamp = datetime.strptime(start_timestamp, "%H:%M:%S.%f")
    stop_timestamp = row['stop_timestamp']
    stop_timestamp = datetime.strptime(stop_timestamp, "%H:%M:%S.%f")
    action_seconds = stop_timestamp - start_timestamp
    action_seconds = action_seconds.total_seconds()
    
    verb = row['verb']
    noun = row['noun'] # TODO: "all_nouns" key gives access to other noun participants where applicable

    structured_action_lists[video_index][clip_index] = {
        "video_id": video_index,
        "clip_index": clip_index,
        "verb": verb,
        "noun": noun,
        "all_nouns": eval(row["all_nouns"]),
        "narration_text": narration_text,
        "action_seconds": action_seconds,
        "start_timestamp": row['start_timestamp'],
        "stop_timestamp": row['stop_timestamp'],
    }
    
print(f"{len(structured_action_lists)} videos collected from EPIC KITCHENS")

# smooth out to a dict of lists
for video_id in structured_action_lists:
    min_clip_id = min(list(structured_action_lists[video_id].keys()))
    max_clip_id = max(list(structured_action_lists[video_id].keys()))
    
    new_list = []
    last_action = None
    for clip_id in range(min_clip_id, max_clip_id+1):
        
        if clip_id not in structured_action_lists[video_id]:
            continue
        
        current_action = structured_action_lists[video_id][clip_id]
        new_list.append(current_action)
        
    structured_action_lists[video_id] = new_list

100%|██████████| 67217/67217 [00:05<00:00, 12288.41it/s]

495 videos collected from EPIC KITCHENS





Retrieve clips for TRAVEl examples:

In [20]:
from pprint import pprint

query_verb = "roll"
query_nouns = {"tortilla",}

retrieved_clips = []
for video in structured_action_lists:
    for clip in structured_action_lists[video]:
        if clip['verb'] == query_verb and set(clip['all_nouns']) == set(query_nouns):
            retrieved_clips.append(clip)

print(f"{len(retrieved_clips)} clips retrieved from {PARTITION} partition of EK-100")
videos_to_look_at = list(set([clip['video_id'] for clip in retrieved_clips]))
print("In videos:", videos_to_look_at)

for clip in retrieved_clips:
    pprint(
        {
            "video_id": clip['video_id'],
            # "clip_index": clip['clip_index'],
            # "verb": clip['verb'],
            # "all_nouns": clip['all_nouns'],
            "start_timestamp": clip['start_timestamp'],
            "stop_timestamp": clip['stop_timestamp'],            
        }
    )

1 clips retrieved from train partition of EK-100
In videos: ['P06_106']
{'start_timestamp': '00:01:10.89',
 'stop_timestamp': '00:01:14.72',
 'video_id': 'P06_106'}


Retrieve videos for clips:

In [6]:
import torch
from video_blip.data.epic_kitchens import EpicKitchensDataset

dataset = EpicKitchensDataset(EK_ANNOTATION_PATH,
                              EK_55_VIDEO_PATH,
                              EK_100_VIDEO_PATH,
                              )

In [8]:
for item in dataset:
    pprint(item)
    break

{'all_noun_classes': '[3]',
 'all_nouns': "['cupboard']",
 'aug_index': 0,
 'clip_index': 0,
 'full_sent_narration': 'The camera wearer opens the cupboard.',
 'narration': 'open cupboard',
 'narration_id': 'P25_106_0',
 'narration_timestamp': '00:00:03.115',
 'narration_timestamp_sec': 3.115,
 'noun': 'cupboard',
 'noun_class': '3',
 'participant_id': 'P25',
 'start_frame': '111',
 'start_timestamp': '00:00:02.22',
 'stop_frame': '177',
 'stop_timestamp': '00:00:03.55',
 'verb': 'open',
 'verb_class': '3',
 'video': tensor([[[[ 87,  87,  88,  ..., 128, 129, 129],
          [ 87,  88,  88,  ..., 129, 129, 129],
          [ 88,  88,  89,  ..., 129, 129, 129],
          ...,
          [ 41,  41,  41,  ..., 163, 163, 163],
          [ 41,  41,  41,  ..., 162, 163, 163],
          [ 41,  41,  41,  ..., 162, 163, 163]],

         [[ 91,  92,  91,  ..., 128, 129, 129],
          [ 92,  92,  92,  ..., 129, 129, 129],
          [ 92,  92,  93,  ..., 129, 129, 129],
          ...,
          [ 41