# Initial configuration

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch, os
print(torch.__version__)
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# Hugging Face cache location, exported through the HF_HOME environment variable.
# NOTE(review): this is a user-specific absolute path - adjust it per machine.
CACHE_DIR = "/scratch/chaijy_root/chaijy0/sstorks/.cache/huggingface"
os.environ['HF_HOME'] = CACHE_DIR

1.13.0+cu117
cuda


In [3]:
!nvidia-smi

Tue Jan 30 17:03:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.06              Driver Version: 545.23.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     On  | 00000000:1E:00.0 Off |                    0 |
|  0%   34C    P0              56W / 300W |      7MiB / 46068MiB |      0%   E. Process |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# CaptainCook4D Toy Empirical Experiments

## Data loading

In [4]:
# Locations of the CaptainCook4D videos and annotations.
# NOTE(review): absolute, cluster-specific paths - adjust when running elsewhere.
VIDEO_DIR = "/nfs/turbo/coe-chaijy-unreplicated/datasets/captaincook4d/data/captain_cook_4d/hololens/sync/pv" # Directory containing CaptainCook4D mp4s
ANNOTATIONS_DIR = "/nfs/turbo/coe-chaijy-unreplicated/datasets/captaincook4d/annotations" # Root of the annotation files (the notebook reads from its annotation_json/ sub-directory)

Boilerplate code to load video frames from video files (generated with GPT-4):

In [5]:
import cv2
import numpy as np

def get_video(video_path):
    """Open a video file with OpenCV.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        The opened ``cv2.VideoCapture``. The caller owns it and must call
        ``release()`` on it when finished.

    Raises:
        IOError: If the capture reports that it could not be opened. The
            message names the offending path.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        # Drop the unusable handle before bailing out; the caller never
        # receives it and therefore could not release it.
        cap.release()
        raise IOError(f"Cannot open video file: {video_path}")

    return cap

def extract_frames(cap, times):
    """Read one RGB frame per requested timestamp from an opened capture.

    Args:
        cap: Capture object exposing ``get``, ``set`` and ``read`` (as returned
            by ``get_video``).
        times: Iterable of timestamps in seconds.

    Returns:
        A list with one entry per timestamp, in order: the frame converted
        from BGR to RGB, or ``None`` (after printing a warning) when the read
        failed.
    """
    frame_rate = cap.get(cv2.CAP_PROP_FPS)  # Frames per second
    extracted = []

    for timestamp in times:
        # Seek to the frame index that corresponds to this timestamp.
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(timestamp * frame_rate))
        success, bgr_frame = cap.read()

        if not success:
            print(f"Warning: Frame at time {timestamp} seconds could not be read.")
            extracted.append(None)
            continue

        # OpenCV hands back BGR; convert to RGB.
        extracted.append(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))

    return extracted

Other utility functions:

In [6]:
def generate_float_series(start, end, step):
    """Return evenly spaced values from ``start`` up to ``end``, always ending on ``end``.

    Args:
        start: First value of the series.
        end: Last value of the series. It is appended when the regular
            spacing does not land on it exactly.
        step: Spacing between consecutive values; its sign is ignored.

    Returns:
        A list ``[start, start + step, start + 2*step, ..., end]``. When
        ``start > end`` no intermediate value is generated and the result is
        ``[start, end]``.

    Raises:
        ValueError: If ``step`` is zero (the series would never advance).
    """
    # Ensure step is a positive float
    step = abs(step)
    if step == 0:
        raise ValueError("step must be non-zero")

    # Initialize the series with the start value
    series = [start]

    # Compute each value from its index instead of repeatedly adding `step`,
    # so rounding error does not accumulate (e.g. 0 -> 1 in steps of 0.1 would
    # otherwise yield 0.9999999999999999 followed by a near-duplicate 1.0).
    index = 1
    while start + index * step <= end:
        series.append(start + index * step)
        index += 1

    # Check if the end value is already in the series
    if series[-1] != end:
        series.append(end)

    return series

Data classes:

In [7]:
from torch.utils.data import Dataset
from PIL import Image
import json
from dataclasses import dataclass
from typing import Optional

# Contents of annotation_json/error_category_idx.json from the annotations directory.
# Opened in a `with` block so the file handle is closed deterministically.
with open(os.path.join(ANNOTATIONS_DIR, "annotation_json/error_category_idx.json"), "r") as _error_categories_file:
    ERROR_CATEGORIES = json.load(_error_categories_file)

@dataclass
class MistakeDetectionExample:
    """One annotated recipe step of one video, with sampled frames and a mistake label."""

    # Recording identifier the step belongs to.
    video_id: str
    # Identifier of the recipe step.
    step_id: int
    # Frames sampled from the step. `Image` is the PIL module, so the element
    # type is the class `Image.Image`.
    frames: list[Image.Image]
    # Text description of the action performed in the step.
    action_description: str
    # True when the step is annotated with a mistake.
    mistake: bool
    # Tag and free-text description of the mistake; left as None for successful steps.
    mistake_type: Optional[str] = None
    mistake_description: Optional[str] = None


Gather data:

In [8]:
import os, json
from pprint import pprint
from tqdm import tqdm
from PIL import Image

# Spacing, in seconds, between keyframes sampled inside a step.
# NOTE: despite its name this is a sampling period, not a rate (it is passed as
# the `step` of generate_float_series); the name is kept for compatibility.
SAMPLE_FREQUENCY = 4.0
# Stop scanning videos once this many examples of each class were collected.
MIN_EXAMPLES_PER_CLASS = 20
# Start with only errors specific to a single step, not related to quantities:
#   Preparation Error involves the wrong object(s)
#   Technique Error involves the action being performed the wrong way
MISTAKE_TYPES_OF_INTEREST = ("Preparation Error", "Technique Error")

# List the CaptainCook4D videos. os.listdir returns entries in arbitrary order,
# so sort them to make the set of collected examples reproducible.
all_video_files = os.listdir(VIDEO_DIR)
video_paths = sorted(f for f in all_video_files if f.endswith('.mp4'))

# Load the annotations with context managers so the file handles are closed.
with open(os.path.join(ANNOTATIONS_DIR, "annotation_json/complete_step_annotations.json"), "r") as annotations_file:
    STEP_ANNOTATIONS = json.load(annotations_file)
with open(os.path.join(ANNOTATIONS_DIR, "annotation_json/error_annotations.json"), "r") as annotations_file:
    ERROR_ANNOTATIONS = json.load(annotations_file)

# Attach each recording's error-annotated steps to its step annotations.
for error_annotation in ERROR_ANNOTATIONS:
    video_id = error_annotation['recording_id']
    STEP_ANNOTATIONS[video_id]["steps_errors"] = error_annotation["step_annotations"]

success_examples = []
error_examples = []
for sample_video_path in tqdm(video_paths):
    # The recording id is the first two underscore-separated parts of the file name.
    sample_video_id = "_".join(sample_video_path.split('_')[:2])
    sample_video_path = os.path.join(VIDEO_DIR, sample_video_path)

    # Skip recordings without error-annotated steps instead of raising KeyError.
    video_steps = STEP_ANNOTATIONS.get(sample_video_id, {}).get("steps_errors")
    if video_steps is None:
        print(f"Warning: no step annotations found for video: {sample_video_id}")
        continue

    try:
        sample_video = get_video(sample_video_path)
    except Exception:
        # Best effort: an unreadable file is skipped. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt still stop the loop.
        print(f"Warning: could not open video file: {sample_video_path}")
        continue

    try:
        for step in video_steps:
            step_duration = step['end_time'] - step['start_time']
            step_id = int(step['step_id'])

            # Some steps are skipped
            if step_duration < 0.1:
                continue

            # Decide whether the step is kept BEFORE decoding any frames, so no
            # video decoding is wasted on steps that are filtered out.
            errors = step.get("errors") or []
            if errors:
                mistake_type = errors[0]["tag"]
                mistake_description = errors[0]['description']
                # NOTE: step['modified_description'] can be used later if needed

                if mistake_type not in MISTAKE_TYPES_OF_INTEREST:
                    continue

                if len(errors) > 1:
                    print("Warning: Some error information discarded from only using the first annotated error.")

            # Extract some keyframes for the action
            adjusted_start = step['start_time'] + min(step_duration * 0.05, 0.5) # Adjust the start time to be later by a maximum of 0.5 seconds
            adjusted_end = step['end_time'] - min(step_duration * 0.3, 3) # Adjust the end time to be earlier by a maximum of 3 seconds
            times = generate_float_series(adjusted_start, adjusted_end, SAMPLE_FREQUENCY) # ultimately, we'll want to look at every image frame in some regular interval to determine if there's a mistake

            # extract_frames returns None for frames it could not read;
            # Image.fromarray cannot handle None, so drop those entries.
            frames = [
                Image.fromarray(frame)
                for frame in extract_frames(sample_video, times)
                if frame is not None
            ]
            if not frames:
                print(f"Warning: no readable frames for step {step_id} of video {sample_video_id}; skipping step.")
                continue

            # The description is split at the first "-"; only the part after it is
            # kept. Stripped for consistency with how RECIPE_STEPS is built.
            action_description = step['description'].partition("-")[2].strip()

            if errors:
                error_examples.append(
                    MistakeDetectionExample(
                        sample_video_id,
                        step_id,
                        frames,
                        action_description,
                        True,
                        mistake_type,
                        mistake_description
                    )
                )
            else:
                success_examples.append(
                    MistakeDetectionExample(
                        sample_video_id,
                        step_id,
                        frames,
                        action_description,
                        False
                    )
                )
    finally:
        # Always release the capture, including on the final iteration (which
        # leaves the loop via `break` below) and when a step raises.
        sample_video.release()

    if len(error_examples) >= MIN_EXAMPLES_PER_CLASS and len(success_examples) >= MIN_EXAMPLES_PER_CLASS:
        print(f"Collected at least {MIN_EXAMPLES_PER_CLASS} positive and negative examples!")
        break
    else:
        print("Error examples:", len(error_examples))
        print("Success examples:", len(success_examples))

  0%|          | 1/335 [00:01<05:41,  1.02s/it]

Error examples: 1
Success examples: 6


  1%|          | 2/335 [00:01<04:42,  1.18it/s]

Error examples: 3
Success examples: 11


  1%|          | 3/335 [00:02<04:57,  1.12it/s]

Error examples: 8
Success examples: 13


  1%|          | 4/335 [00:05<08:12,  1.49s/it]

Error examples: 8
Success examples: 24


  1%|▏         | 5/335 [00:05<06:00,  1.09s/it]

Error examples: 11
Success examples: 24


  2%|▏         | 6/335 [00:06<05:07,  1.07it/s]

Error examples: 12
Success examples: 31


  2%|▏         | 7/335 [00:07<05:15,  1.04it/s]

Error examples: 17
Success examples: 35


  2%|▏         | 8/335 [00:08<05:16,  1.03it/s]

Error examples: 17
Success examples: 41


  2%|▏         | 8/335 [00:09<06:23,  1.17s/it]

Collected at least 20 positive and negative examples!





## Model setup

## Step 1: VQG with LLaMA

Load model:

In [15]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

LM_NAME = "meta-llama/Llama-2-7b-hf"
# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `None` is passed and
# transformers falls back to whatever credentials are already configured.
# The token that used to be written on this line must be treated as leaked and
# revoked in the Hugging Face account settings.
model = pipeline(
    "text-generation",
    model=LM_NAME,
    token=os.environ.get("HF_TOKEN"),
    model_kwargs={"load_in_8bit": True},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Load recipe steps:

In [10]:
from pprint import pprint
import json

# Load the step-id -> description mapping; the `with` block closes the file handle.
with open(os.path.join(ANNOTATIONS_DIR, "annotation_json/step_idx_description.json"), "r") as recipe_steps_file:
    raw_recipe_steps = json.load(recipe_steps_file)

# Keys become ints. Everything up to and including the first "-" of each
# description is dropped; later hyphens (e.g. "15-30 more seconds") are preserved.
RECIPE_STEPS = {int(k): v.partition("-")[2].strip() for k, v in raw_recipe_steps.items()}

pprint(RECIPE_STEPS)

{1: 'Pour 1 egg into the ramekin cup',
 2: 'Place the egg from the cup over the lettuce',
 3: 'Coat a 6-oz. ramekin cup with cooking spray',
 4: 'Microwave the ramekin cup uncovered on high for 30 seconds',
 5: 'sprinkle 1 tablespoon of cheese on cup',
 6: 'Top cup with 1 tablespoon of salsa',
 7: 'replace the top of the English muffin',
 8: 'Continue to Microwave for 15-30 more seconds or until the egg is almost '
    'set',
 9: 'Line the bottom piece of the English muffin with lettuce',
 10: 'Microwave just until cheese melts, about 10 seconds',
 11: 'stir the ramekin cup',
 12: 'Cut the English muffin into two pieces with a knife',
 14: 'Peel 1 garlic clove',
 15: 'Pour the sauces over the meatballs',
 16: 'Cut 1/8 garlic clove',
 17: 'Peel one medium onion',
 18: 'Stir the contents in the microwave with a spoon',
 19: 'Slice 1/8 medium onion',
 20: 'Microwave the plate, covered, on high for 1.5 minutes',
 21: 'Place 5 meatballs in a Microwave-safe plate',
 22: 'Cut 1/4 medium carro

Parse noun phrases from steps:

In [13]:
import spacy
from pprint import pprint

nlp = spacy.load("en_core_web_sm")

# Noun chunks (compared lower-cased) that are dropped from the results.
ignored_noun_phrases = ["you", "this step", "it", "about 30 seconds", "that"]

# Map each recipe step id to the noun phrases spaCy finds in its description.
# TODO: This could be improved further, e.g., using an LM - a lot of noise in this
RECIPE_STEP_OBJECTS = {}
for step_id, step in RECIPE_STEPS.items():
    RECIPE_STEP_OBJECTS[step_id] = [
        chunk.text
        for chunk in nlp(step).noun_chunks
        if chunk.text.lower() not in ignored_noun_phrases
    ]

pprint(RECIPE_STEP_OBJECTS)

{1: ['1 egg', 'the ramekin cup'],
 2: ['the egg', 'the cup', 'the lettuce'],
 3: ['a 6-oz. ramekin cup', 'cooking spray'],
 4: ['the ramekin cup', '30 seconds'],
 5: ['1 tablespoon', 'cheese', 'cup'],
 6: ['Top cup', '1 tablespoon', 'salsa'],
 7: ['the top', 'the English muffin'],
 8: ['15-30 more seconds', 'the egg'],
 9: ['the bottom piece', 'the English muffin', 'lettuce'],
 10: ['Microwave', 'cheese melts'],
 11: ['the ramekin cup'],
 12: ['the English muffin', 'two pieces', 'a knife'],
 14: ['Peel 1 garlic clove'],
 15: ['the sauces', 'the meatballs'],
 16: ['1/8 garlic clove'],
 17: ['one medium onion'],
 18: ['the contents', 'the microwave', 'a spoon'],
 19: ['Slice 1/8 medium onion'],
 20: ['the plate', '1.5 minutes'],
 21: ['Place', '5 meatballs', 'a Microwave-safe plate'],
 22: ['Cut 1/4 medium carrot', 'short, thin strips'],
 23: ['Mix 1/4 cup sweet-and-sour sauce',
      '1/2 teaspoon soy sauce',
      'a small bowl'],
 24: ['the plate', 'the carrots', 'onion', 'garlic', '1

Generate success verification questions:

In [21]:
import torch

# Few-shot demonstrations shown to the LM before the actual query. Parentheses
# are used for the implicit string concatenation; the previous trailing
# backslashes after the last line of example1/example3 only worked because a
# blank line happened to follow them.
example1 = ('The recipe step is "Spoon the mixture from the bowl onto the bread". To visually verify that this step is complete, what are some questions we could ask about an image of the bread and their expected answers?\n'
            '1. Is there mixture on the bread? Yes\n'
            '2. Is there any bread without any mixture on top of it? No')

example2 = ('The recipe step is "Roll the tortilla into a thin, log shape about 1 inch thick. Make sure no filling leaks out.". To visually verify that this step is complete, what are some questions we could ask about an image of the tortilla and their expected answers?\n'
            '1. Is the tortilla in a thin log shape? Yes\n'
            '2. Is there any filling leaking out of the tortilla? No')

example3 = ('The recipe step is "Fold the coffee filter into quarters". To visually verify that this step is complete, what are some questions we could ask about an image of the coffee filter and their expected answers?\n'
            '1. Is the coffee filter in a quarter circle? Yes\n'
            '2. Is the coffee filter folded? Yes')


def _parse_questions_answers(generated_lines):
    """Split generated "<question>? <answer>" lines into (question, answer) tuples.

    Lines that contain no "?" cannot be split into a question and an answer;
    they are reported and skipped instead of raising IndexError.
    """
    pairs = []
    for line in generated_lines:
        parts = line.split("?")
        if len(parts) < 2:
            print(f"Warning: could not parse a question/answer pair from generated line: {line!r}")
            continue
        pairs.append((parts[0].strip() + "?", parts[1].strip()))
    return pairs


vqg_outputs = {}
with torch.no_grad():
    # TODO: make this more efficient later
    for step_id, step in tqdm(RECIPE_STEPS.items()):
        vqg_outputs[step_id] = {}
        for noun_phrase in RECIPE_STEP_OBJECTS[step_id]:

            test = f'The recipe step is "{step}". To visually verify that this step is complete, what are some questions we could ask about an image of {noun_phrase} and their expected answers?\n'

            prompt = "\n\n".join([example1, example2, example3, test])

            text = model(prompt,
                         max_new_tokens=256,
                         do_sample=False)[0]['generated_text']
            # The pipeline output contains the prompt; remove the demonstrations
            # so the text starts with the query line.
            text = text.replace(example1, "").replace(example2, "").replace(example3, "").strip()
            # Line 0 is the query itself; the next two lines are the generated pairs.
            # NOTE: only extract k=2 questions and answers; can adjust this as needed later
            questions_answers = _parse_questions_answers(text.split("\n")[1:3])

            print("===========================================================================")
            print(step)
            pprint(questions_answers)

            vqg_outputs[step_id][noun_phrase] = questions_answers

  0%|          | 0/350 [00:00<?, ?it/s]

Coat a 6-oz. ramekin cup with cooking spray
[('1. Is the ramekin cup coated with cooking spray?', 'Yes'),
 ('2. Is the ramekin cup 6 ounces?', 'Yes')]


  0%|          | 1/350 [01:14<7:12:55, 74.43s/it]

Coat a 6-oz. ramekin cup with cooking spray
[('1. Is the ramekin cup coated with cooking spray?', 'Yes'),
 ('2. Is the ramekin cup coated with cooking spray?', 'Yes')]
Pour 1 egg into the ramekin cup
[('1. Is there an egg in the ramekin cup?', 'Yes'),
 ('2. Is the egg whole?', 'Yes')]


  0%|          | 1/350 [02:12<12:53:31, 132.98s/it]


KeyboardInterrupt: 

In [None]:
import json
# Persist the generated questions. The directory is created if missing and the
# file is written inside a `with` block so it is flushed and closed.
# NOTE: JSON stores the integer step ids as string keys and the
# (question, answer) tuples as lists; convert back when reloading this file.
os.makedirs("cache_dir", exist_ok=True)
with open("cache_dir/vqg_outputs.json", "w") as vqg_outputs_file:
    json.dump(vqg_outputs, vqg_outputs_file)

## Step 2: VQA with LLaVA

Load model:

In [None]:
# Setup code grabbed from docs: https://huggingface.co/docs/transformers/model_doc/llava#transformers.LlavaForConditionalGeneration
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Checkpoint used for visual question answering.
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
# Processor used below to turn a (prompt, image) pair into model inputs.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# NOTE: this rebinds `model`, replacing the LLaMA text-generation pipeline created
# in the VQG section; re-run that section's load cell before generating questions again.
model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR, load_in_8bit=True)

Ask success verification questions per frame:

In [None]:
import random
from pprint import pprint
import torch

examples = success_examples[:20] + error_examples[:20]
examples = examples[:1] # Just for debug purposes

prompt_template = "USER: <image>\n{question} (yes/no) ASSISTANT: "

# Vocabulary ids whose next-token logits are compared to decide yes vs. no.
# NOTE(review): YES_ID / NO_ID were not defined anywhere in the visible notebook.
# They are derived here from the processor's tokenizer; confirm these are the
# sub-word variants the model actually predicts after this prompt (the prompt's
# trailing space may affect which variant follows).
YES_ID = processor.tokenizer.encode("Yes", add_special_tokens=False)[-1]
NO_ID = processor.tokenizer.encode("No", add_special_tokens=False)[-1]

# One list per example, aligned with `examples`; each entry is
# (frame, prompt, probs, pred, expected_answer) with probs = [P(no), P(yes)].
vqa_outputs = []
with torch.no_grad():
    for example in examples:
        this_vqa_outputs = []

        # Questions are looked up by the example's own step id (the previous
        # `step_idx` was never defined). A step without generated questions
        # yields an empty result so the alignment with `examples` is preserved.
        step_questions = vqg_outputs.get(example.step_id, {})
        if not step_questions:
            print(f"Warning: no verification questions available for step {example.step_id}.")

        questions_answers = [QA for noun_phrase in step_questions for QA in step_questions[noun_phrase]]
        prompts = [prompt_template.format(question=question) for question, _ in questions_answers]
        expected_answers = [answer for _, answer in questions_answers]

        # TODO: make more efficient for full evaluation; will need to mess around with padding, ensure padding token is on correct side
        for frame in example.frames:
            for prompt, expected_answer in zip(prompts, expected_answers):
                inputs = processor(text=prompt, images=frame, return_tensors="pt").to(device)

                # Score only the next token after the prompt.
                logits = model(**inputs).logits[0] # (seq length, vocab size)
                no_logit = logits[-1, NO_ID]
                yes_logit = logits[-1, YES_ID]
                probs = torch.softmax(torch.stack((no_logit, yes_logit), dim=0), dim=0).detach().cpu()

                # probs[0] is P(no) and probs[1] is P(yes). The earlier check
                # (`probs[0] <= 0.5` -> "No") was inverted. An exact tie still
                # resolves to "No".
                pred = "Yes" if probs[1] > 0.5 else "No"

                this_vqa_outputs.append((frame, prompt, probs, pred, expected_answer))

        vqa_outputs.append(this_vqa_outputs)

## Step 3: Evaluate VQA Outputs

In [None]:
for example, outputs in zip(examples, vqa_outputs):
    # One flag per (frame, question) pair: a mistake is predicted whenever the
    # VQA answer differs from the answer expected for a completed step.
    mistake_predictions = [
        pred != expected_answer
        for _frame, _prompt, _probs, pred, expected_answer in outputs
    ]

    pprint(mistake_predictions)