In [1]:
json_path = "data/annotations/caption/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_caption.json"
video_root = "data/videos"

In [None]:
import os
import json
import cv2
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
import torch


In [3]:
# LLAVA Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to("cuda" if torch.cuda.is_available() else "cpu")

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 39.34it/s]


In [None]:
# Blip-2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor2 = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model2 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
model2.to(device)

In [4]:
def generate_caption(image, prompt):
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=100)
    return processor.decode(output[0], skip_special_tokens=True)

In [5]:
def extract_middle_frame(video_path, start_time, end_time):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"[ERROR] Cannot open video: {video_path}")
        return None
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0:
        cap.release()
        return None
    
    middle_time = (start_time + end_time) / 2.0
    frame_number = int(middle_time * fps)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"[ERROR] Failed to read frame at {frame_number} in video: {video_path}")
        return None
    return frame

In [6]:
def find_video_path(video_root, video_file):
    for subfolder in ["train", "val"]:
        folder_path = os.path.join(video_root, subfolder)
        for root, _, files in os.walk(folder_path):
            if video_file in files:
                return os.path.join(root, video_file)
    return None

In [7]:

def find_video(video_root: str, video_file: str):
    for dirpath, _, files in os.walk(video_root):
        if video_file in files:
            return os.path.join(dirpath, video_file)
    return None

In [8]:
def run_on_json(json_path: str, video_root: str):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    video_file = data.get("overhead_videos", [None])[0]
    if not video_file:
        print("[SKIP] No video listed in JSON.")
        return

    video_path = find_video(video_root, video_file)
    if not video_path:
        print(f"[ERROR] Could not find video file: {video_file}")
        return

    print(f"[INFO] Processing: {os.path.basename(json_path)}")
    for event in data.get("event_phase", []):
        start = float(event["start_time"])
        end = float(event["end_time"])
        labels = event.get("labels", [])

        frame = extract_middle_frame(video_path, start, end)
        if frame is None:
            continue

        caption_ped = generate_caption(frame, "<image> Describe the pedestrian behhavior at the time of the crash.")
        caption_veh = generate_caption(frame, "<image> Describe the vehicle behavior at the time of the crash.")

        print(f"\nLabels for video {video_path}")
        print(f"Labels: {labels}")
        print(f"🧍 Pedestrian: {caption_ped}")
        print(f"🚗 Vehicle: {caption_veh}")

In [None]:
#Blip-2
run_on_json(json_path, video_root)

[INFO] Processing: 20230707_8_SN46_T1_caption.json

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['4']
🧍 Pedestrian: a pedestrian is a person who is walking or riding a bicycle
🚗 Vehicle: Describe the vehicle's speed, direction, and speed at the time of the crash

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['3']
🧍 Pedestrian: a pedestrian is a person who is walking or riding a bicycle
🚗 Vehicle: Describe the vehicle's speed, direction, and speed at the time of the crash

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['2']
🧍 Pedestrian: Describe the pedestrian's behavior at the time of the crash
🚗 Vehicle: Describe the vehicle's speed, direction, and speed at the time of the crash

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['1']
🧍 

In [9]:
#Llava
run_on_json(json_path, video_root)

[INFO] Processing: 20230707_8_SN46_T1_caption.json

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['4']
🧍 Pedestrian:  Describe the pedestrian behhavior at the time of the crash.

In the image, there are several cars and a truck on the road, and a dog is also present. The dog is located near the center of the scene, and it appears to be walking or standing on the road. The cars and truck are positioned around the dog, with some cars closer to the dog and others further away. The scene suggests that the dog might be wandering onto the road, which could potentially lead to an accident if drivers are not cautious
🚗 Vehicle:  Describe the vehicle behavior at the time of the crash.

The car is stopped at the intersection, waiting for the traffic light to change.

Labels for video data/videos/train/20230707_8_SN46_T1/overhead_view/20230707_8_SN46_T1_Camera1_0.mp4
Labels: ['3']
🧍 Pedestrian:  Describe the pedestrian behhavior at t