SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

SPDX-License-Identifier: Apache-2.0

In [22]:
import json
import os
import subprocess
import time

###############################################################
# CELL 1: Configure Docker to use the disk with MOST free space
#
# What this does:
#   1. Reads `df -h` to get all mounted filesystems
#   2. Filters out things like tmpfs, /run, /dev, etc.
#   3. Picks the mountpoint with the largest free space (in GB)
#   4. Writes /etc/docker/daemon.json with:
#        { "data-root": "<best_mount>/docker" }
#   5. Restarts Docker so the new data-root takes effect
#      (the existing /var/lib/docker is left in place, not deleted)
#   6. Prints Docker Root Dir for verification
###############################################################

# Mount trees that are virtual / irrelevant and must never be chosen.
_SKIPPED_MOUNT_ROOTS = ("/run", "/sys", "/proc", "/dev", "/snap")

# `df -h` size suffixes are powers of 1024; each value is "GB per one unit".
_GB_PER_UNIT = {
    "K": 1.0 / 1024.0 ** 2,
    "M": 1.0 / 1024.0,
    "G": 1.0,
    "T": 1024.0,
    "P": 1024.0 ** 2,
    "E": 1024.0 ** 3,
}


def _avail_to_gb(avail):
    """Convert a `df -h` size such as "734G", "1.5T" or "0" to GB (None if unparseable)."""
    unit = avail[-1:].upper()
    if unit in _GB_PER_UNIT:
        number, scale = avail[:-1], _GB_PER_UNIT[unit]
    else:
        # No known suffix: a bare number (typically just "0") is treated as bytes.
        number, scale = avail, 1.0 / 1024.0 ** 3
    try:
        return float(number) * scale
    except ValueError:
        return None


def get_mounts_with_free_space(df_output=None):
    """
    Return a list of (mountpoint, free_gb) for every candidate filesystem.

    Args:
        df_output: Optional text in the format printed by
            `df -h --output=target,avail` (with or without the header line).
            When None (the default) `df` is executed to obtain it.

    Returns:
        A list of (mountpoint, free_gb) tuples in the order df printed them.
        Mountpoints equal to or located under /run, /sys, /proc, /dev and
        /snap are excluded.

    Raises:
        RuntimeError: If no usable mountpoint is left after filtering.
    """
    if df_output is None:
        # LC_ALL=C keeps the decimal separator a "." so float() can parse it.
        # check=False: like the previous `df | tail` pipeline, a non-zero df
        # exit status is tolerated and whatever was printed is still parsed.
        proc = subprocess.run(
            ["df", "-h", "--output=target,avail"],
            capture_output=True,
            text=True,
            check=False,
            env={**os.environ, "LC_ALL": "C"},
        )
        df_output = proc.stdout

    mounts = []
    for line in df_output.splitlines():
        # Split from the right: the size column never contains whitespace,
        # but a mountpoint may (e.g. "/mnt/my disk").
        parts = line.strip().rsplit(None, 1)
        if len(parts) != 2:
            continue
        mount, avail = parts

        # Filter out virtual / irrelevant mounts. Match whole path components
        # so that e.g. "/devdata" is not mistaken for something under "/dev".
        if any(mount == root or mount.startswith(root + "/") for root in _SKIPPED_MOUNT_ROOTS):
            continue

        free_gb = _avail_to_gb(avail)
        if free_gb is None:
            # Header line ("Mounted on  Avail") or a placeholder such as "-".
            continue

        mounts.append((mount, free_gb))

    if not mounts:
        raise RuntimeError("No suitable mounts found from df output.")
    return mounts

# 1. Discover best disk
mounts = get_mounts_with_free_space()
# max() returns the first entry with the largest free space, i.e. the same
# mount that sorting in descending order and taking element [0] would pick.
best_mount, best_free = max(mounts, key=lambda entry: entry[1])

print(f"üì¶ Disk with largest free space: {best_mount} ({best_free:.2f} GB free)")

docker_root = os.path.join(best_mount, "docker")
print("‚û°Ô∏è Docker will use data-root:", docker_root)

# 2. Prepare daemon.json
# Start from the existing configuration (if any) so that unrelated settings,
# such as registered container runtimes, survive; only "data-root" is changed.
daemon_path = "/etc/docker/daemon.json"
try:
    with open(daemon_path) as f:
        daemon_cfg = json.load(f)
except FileNotFoundError:
    daemon_cfg = {}
except json.JSONDecodeError as exc:
    print(f"Existing {daemon_path} is not valid JSON ({exc}); replacing it.")
    daemon_cfg = {}
if not isinstance(daemon_cfg, dict):
    daemon_cfg = {}
daemon_cfg["data-root"] = docker_root

# Stage the file in /tmp and move it into place with sudo
# (presumably the notebook user cannot write to /etc/docker directly).
tmp_daemon = "/tmp/daemon.json.tmp"
with open(tmp_daemon, "w") as f:
    json.dump(daemon_cfg, f, indent=2)

print("üìù Writing /etc/docker/daemon.json with:", daemon_cfg)
# check=True: stop here if the config could not be installed instead of
# restarting Docker with the old configuration and silently carrying on.
subprocess.run(["sudo", "mv", tmp_daemon, daemon_path], check=True)

# 3. Restart Docker so it picks up the new data-root
print(" Restart Docker services...")
subprocess.run(["sudo", "systemctl", "restart", "docker"], check=True)

# Give the daemon a moment to come back up before querying it.
time.sleep(5)

# Verify new storage location
print(subprocess.run("docker info | grep 'Docker Root Dir'", shell=True, capture_output=True, text=True).stdout)


üì¶ Disk with largest free space: /ephemeral (734.00 GB free)
‚û°Ô∏è Docker will use data-root: /ephemeral/docker
üìù Writing /etc/docker/daemon.json with: {'data-root': '/ephemeral/docker'}
 Restart Docker services...
 Docker Root Dir: /ephemeral/docker



In [6]:
##############################################################
# CONFIG + IMPORTS
#
# This section imports required libraries and declares all the
# key URLs and paths needed to talk to the VST and Alert Bridge.
#
# You only need to edit HOST, STORAGE_HTTP_PORT, and your 
# alert media base directory if your deployment differs.
##############################################################

import os
import time
import json
import uuid
from datetime import datetime, timezone
import requests

# Machine on which the docker-compose stack is running.
# Keep "localhost" when Jupyter runs on that same machine.
HOST = "localhost"

# HTTP port of VST's "storage-ms" microservice (video registration).
# It has to equal STORAGE_HTTP_PORT in your .env file.
# NOTE(review): the previous comment cited 32000 as the usual default in
# NVIDIA examples while this notebook uses 30000 - confirm against your .env.
STORAGE_HTTP_PORT = 30000
VST_BASE_URL = "http://{}:{}".format(HOST, STORAGE_HTTP_PORT)

# Port of the Alert Bridge server (9080 unless your deployment changed it).
ALERT_BRIDGE_PORT = 9080
ALERT_BRIDGE_BASE_URL = "http://{}:{}".format(HOST, ALERT_BRIDGE_PORT)

# Must be identical to the docker compose variable ALERT_REVIEW_MEDIA_BASE_DIR,
# e.g. if the stack was started with
#   ALERT_REVIEW_MEDIA_BASE_DIR=/tmp/alert-media-dir docker compose up
# then use exactly that directory here.
ALERT_REVIEW_MEDIA_BASE_DIR = "/tmp/alert-media-dir"

# User videos live in this subfolder of ALERT_REVIEW_MEDIA_BASE_DIR; the
# relative path handed to VST starts with it.
LOCAL_MEDIA_SUBDIR = "media"

# Create the media folder up front so later copies into it cannot fail on a
# missing directory.
os.makedirs(os.path.join(ALERT_REVIEW_MEDIA_BASE_DIR, LOCAL_MEDIA_SUBDIR), exist_ok=True)

# Echo the effective settings for a quick visual check.
for _label, _value in (
    ("VST_BASE_URL", VST_BASE_URL),
    ("ALERT_BRIDGE_BASE_URL", ALERT_BRIDGE_BASE_URL),
    ("ALERT_REVIEW_MEDIA_BASE_DIR", ALERT_REVIEW_MEDIA_BASE_DIR),
):
    print(f"{_label}: {_value}")


VST_BASE_URL: http://localhost:30000
ALERT_BRIDGE_BASE_URL: http://localhost:9080
ALERT_REVIEW_MEDIA_BASE_DIR: /tmp/alert-media-dir


In [12]:
################################################################
# COPY LOCAL VIDEO FILE INTO THE DIRECTORY THAT ALERT BRIDGE 
# AND VST EXPECT.
#
# Event Reviewer does NOT fetch files from arbitrary paths.
# The file MUST be under ALERT_REVIEW_MEDIA_BASE_DIR.
#
# You provide the local video file path below.
################################################################

import shutil

# ---------------------------------------------------------------
# EDIT THIS: path to the local video you want to submit, for example
#   "/home/ubuntu/videos/forklift_incident.mp4"
# Clips referenced by this notebook:
#   ./assets/warehouse_safety_video_short_ppe.mp4
#   ./assets/warehouse_safety_video_short_no_ppe.mp4
# ---------------------------------------------------------------
SOURCE_VIDEO_PATH = "./assets/warehouse_safety_video_short_ppe.mp4"

# Stop right away with a clear message if the source path does not exist.
if not os.path.exists(SOURCE_VIDEO_PATH):
    raise FileNotFoundError(f"Source video not found: {SOURCE_VIDEO_PATH}")

# File name without any directory part (e.g. demo.mp4).
target_filename = os.path.split(SOURCE_VIDEO_PATH)[1]

# Path relative to the alert media directory; this is what is sent to VST
# and Alert Bridge.
relative_media_path = "/".join((LOCAL_MEDIA_SUBDIR, target_filename))

# Absolute location of the copy inside the alert media directory.
dest_path = os.path.join(ALERT_REVIEW_MEDIA_BASE_DIR, relative_media_path)

print(
    "Copying local video ‚Üí alert media directory:",
    f"  Source: {SOURCE_VIDEO_PATH}",
    f"  Target: {dest_path}",
    sep="\n",
)

# copy2 copies the file content together with its metadata.
shutil.copy2(SOURCE_VIDEO_PATH, dest_path)

print("\n‚úÖ File copied successfully.")
print(f"Relative path that VST expects: {relative_media_path}")


Copying local video ‚Üí alert media directory:
  Source: ./assets/warehouse_safety_video_short_ppe.mp4
  Target: /tmp/alert-media-dir/media/warehouse_safety_video_short_ppe.mp4

‚úÖ File copied successfully.
Relative path that VST expects: media/warehouse_safety_video_short_ppe.mp4


In [13]:
################################################################
# REGISTER THE VIDEO WITH VST (Video Storage Toolkit)
#
# This version matches the example in NVIDIA docs that uses
# form-encoded data, not JSON. The key detail:
#   - we send "metadata" as json.dumps(metadata)
#   - we use `data=form_data` instead of `json=...`
################################################################

import json
import time

# Descriptive metadata sent along with the clip; the timestamp is the
# current time in epoch seconds.
metadata = dict(
    sensorId="camera-001",
    timestamp=int(time.time()),
    eventInfo="manual_upload_demo",
    streamName="manual_demo_stream",
)

# Form fields of the registration request:
# - mediaFilePath is the path RELATIVE to ALERT_REVIEW_MEDIA_BASE_DIR
# - metaDataFilePath stays empty because there is no separate CV metadata file
# - metadata has to be a JSON *string*, not a nested object
form_data = dict(
    mediaFilePath=relative_media_path,
    metaDataFilePath="",
    metadata=json.dumps(metadata),
)

print("Sending payload to VST (as form data):")
# For readability, show the metadata as a nested object rather than as the
# encoded string that is actually transmitted.
print(json.dumps({**form_data, "metadata": metadata}, indent=2))

# data= (not json=) so the payload goes out form-encoded.
resp = requests.post(VST_BASE_URL + "/api/v1/storage/file", data=form_data)
print(f"\nStatus: {resp.status_code}")
print(f"Response: {resp.text}")

# Abort on HTTP errors before trying to read the body.
resp.raise_for_status()
vst_response = resp.json()
vst_id = vst_response["id"]

print("\n‚úÖ VST registration complete.")
print(f"VST ID: {vst_id}")
print(f"VST filePath: {vst_response.get('filePath')}")



Sending payload to VST (as form data):
{
  "mediaFilePath": "media/warehouse_safety_video_short_ppe.mp4",
  "metaDataFilePath": "",
  "metadata": {
    "sensorId": "camera-001",
    "timestamp": 1765396539,
    "eventInfo": "manual_upload_demo",
    "streamName": "manual_demo_stream"
  }
}

Status: 200
Response: {
	"bytes" : 8588407,
	"created_at" : "2025-12-10T19:55:40.369Z",
	"filePath" : "media/warehouse_safety_video_short_ppe.mp4",
	"filename" : "warehouse_safety_video_short_ppe",
	"id" : "976250dc-4144-4c17-b8a6-9c9e603592d1",
	"sensorId" : "camera-001"
}

‚úÖ VST registration complete.
VST ID: 976250dc-4144-4c17-b8a6-9c9e603592d1
VST filePath: media/warehouse_safety_video_short_ppe.mp4


In [17]:
# Display the ID that VST returned when the clip was registered; the alert
# payload built later refers to the clip through this value.
vst_id

'976250dc-4144-4c17-b8a6-9c9e603592d1'

In [20]:
################################################################
# SEND ALERT TO ALERT BRIDGE
#
# This is the main step. You build an alert JSON payload and send:
#
#   POST /api/v1/alerts
#
# This triggers:
#   1. VLM Reasoning (Cosmos-Reason1-7B) inside Event Reviewer
#   2. Event summary generation
#   3. Display in the Alert Inspector UI
#
# Fields to know:
#   - id → unique alert ID
#   - video_path → path to the file, RELATIVE to ALERT_REVIEW_MEDIA_BASE_DIR
#   - vst_id → links the alert to VST's stored clip
#   - start_time / end_time → seconds in the clip (use full clip)
#   - vss_params → how VLM should analyze the clip
################################################################

from pprint import pprint

# A fresh UUID per submission keeps every alert uniquely identifiable.
alert_id = str(uuid.uuid4())

# Current time in UTC, formatted as an ISO timestamp with a trailing "Z".
timestamp_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# NOTE: despite its name this variable holds the RELATIVE media path
# (e.g. "media/warehouse_safety_video_short_no_ppe.mp4"). According to the
# original note, the backend joins it with ALERT_REVIEW_MEDIA_BASE_DIR.
video_abs_path = relative_media_path

# ---------------------------
# VLM (Cosmos) request options
# ---------------------------
vlm_params = dict(
    prompt="Is the person on the ladder wearing a hardhat and safety vest?",
    system_prompt=(
        "You are a warehouse monitoring system tasked with identifying safety events in a warehouse. "
        "Answer the user's question accurately based on the input video."
    ),
    max_tokens=128,
    temperature=0.3,
    top_p=0.3,
    top_k=40,
    seed=42,
)

# ---------------------------
# VSS pipeline options
# ---------------------------
vss_params = dict(
    vlm_params=vlm_params,
    cv_metadata_overlay=False,  # no CV pipeline involved
    enable_reasoning=False,     # keep the answer short, no reasoning trace
    do_verification=True,
    debug=True,
)

# ---------------------------
# Alert payload
# ---------------------------
alert_payload = {
    "id": alert_id,
    "version": "1.0",
    "@timestamp": timestamp_iso,
    "sensor_id": "manual_demo_sensor",
    "video_path": video_abs_path,
    "vst_id": vst_id,
    # 0.0 .. 9999.0 is used to cover the whole clip.
    "start_time": "0.0",
    "end_time": "9999.0",
    "alert": dict(
        severity="MEDIUM",
        status="REVIEW_PENDING",
        type="manual_demo_alert",
        description="Manual alert triggered from Jupyter.",
    ),
    "event": dict(
        type="video_analysis",
        description="Manual demo event",
    ),
    "confidence": 1.0,
    "cv_metadata_path": "",  # unused in this demo
    "vss_params": vss_params,
    "meta_labels": [
        {"key": label_key, "value": label_value}
        for label_key, label_value in (
            ("prompt_index", "0"),
            ("prompt_text", vlm_params["prompt"]),
            ("enable_reasoning", "False"),
        )
    ],
}

print("Sending alert to Alert Bridge with payload:")
pprint(alert_payload)

# json= serializes the payload as the JSON request body.
resp = requests.post(ALERT_BRIDGE_BASE_URL + "/api/v1/alerts", json=alert_payload)
print(f"\nStatus: {resp.status_code}")
print(f"Response: {resp.text}")
# Raise on HTTP errors so the success message below is only printed for
# accepted alerts.
resp.raise_for_status()

print("\n‚úÖ Alert successfully submitted.")
print(f"Alert ID: {alert_id}")


Sending alert to Alert Bridge with payload:
{'@timestamp': '2025-12-10T20:04:14Z',
 'alert': {'description': 'Manual alert triggered from Jupyter.',
           'severity': 'MEDIUM',
           'status': 'REVIEW_PENDING',
           'type': 'manual_demo_alert'},
 'confidence': 1.0,
 'cv_metadata_path': '',
 'end_time': '9999.0',
 'event': {'description': 'Manual demo event', 'type': 'video_analysis'},
 'id': 'd5a518bb-70f9-416d-9e03-2700ad53b108',
 'meta_labels': [{'key': 'prompt_index', 'value': '0'},
                 {'key': 'prompt_text',
                  'value': 'Is the person on the ladder wearing a hardhat and '
                           'safety vest?'},
                 {'key': 'enable_reasoning', 'value': 'False'}],
 'sensor_id': 'manual_demo_sensor',
 'start_time': '0.0',
 'version': '1.0',
 'video_path': 'media/warehouse_safety_video_short_ppe.mp4',
 'vss_params': {'cv_metadata_overlay': False,
                'debug': True,
                'do_verification': True,
        

# Part 3: VSS Event Review

## Event Review 

In this notebook we'll explore the [NVIDIA AI blueprint for video search and summarization (VSS)](https://build.nvidia.com/nvidia/video-search-and-summarization/blueprintcard) Event Review feature, introduced in VSS 2.4, which allows VSS to act as an intelligent add-on to any computer vision pipeline by providing direct VLM access. 

### Learning Objectives:
This notebook explores the following topics:
* VSS REST APIs for direct VLM Access 
* Use VSS to generate video captions 
* Analyze short video clips for summaries, Q&A and Alerts 


In [1]:
# Base URL of the VSS REST API.
vss_url = "http://localhost:8100"

# Sample videos used throughout the notebook: the full warehouse video and
# the two short clips (worker with PPE / worker without PPE).
_assets_dir = "assets"
warehouse_safety_video = f"{_assets_dir}/warehouse_safety_video.mp4"
warehouse_safety_video_short_ppe = f"{_assets_dir}/warehouse_safety_video_short_ppe.mp4"
warehouse_safety_video_short_no_ppe = f"{_assets_dir}/warehouse_safety_video_short_no_ppe.mp4"

In [2]:
# Full URLs of the VSS endpoints used in this notebook, all derived from vss_url.
health_endpoint = f"{vss_url}/health/ready"                          # readiness check of the VSS server
upload_file_endpoint = f"{vss_url}/files"                            # upload and manage files
summarize_endpoint = f"{vss_url}/summarize"                          # summarize uploaded content
qna_endpoint = f"{vss_url}/chat/completions"                         # ask questions about ingested video
review_alert_endpoint = f"{vss_url}/reviewAlert"                     # yes/no event review through the VLM
generate_vlm_captions_endpoint = f"{vss_url}/generate_vlm_captions"  # direct VLM captioning

In [None]:
# Install the notebook's Python dependencies using the interpreter that runs
# this kernel, so the packages end up in the environment the notebook uses.
import sys 
# sys.executable is the path of the currently running Python interpreter.
python_exe = sys.executable
# Bootstrap / upgrade pip for that interpreter.
!{python_exe} -m ensurepip --upgrade
# Install everything listed in requirements.txt.
!{python_exe} -m pip install -r requirements.txt 

In [4]:
#helper function to verify responses 
import json
import requests
from IPython.display import Markdown, display

def check_response(response, text=False):
    """Print a short report for an HTTP response and return its body.

    Args:
        response: Response object exposing ``status_code``, ``text`` and
            ``json()``.
        text: If True the body is printed and returned as raw text;
            otherwise it is decoded as JSON, pretty-printed and returned as
            the decoded object.

    Returns:
        The body (text or decoded JSON) when the status code is 200,
        otherwise None.
    """
    print(f"Response Code: {response.status_code}")

    # Anything other than 200 is reported as an error and yields None.
    if response.status_code != 200:
        print("Response Status: Error")
        print(response.text)
        return None

    print("Response Status: Success")
    if text:
        body = response.text
        print(body)
        return body

    payload = response.json()
    print(json.dumps(payload, indent=4))
    return payload

Let's use the health endpoint to verify your VSS instance is running. **Make sure the following cell outputs "Response Code: 200" before proceeding.**

In [None]:
# Query the readiness endpoint and report the outcome; the body is handled
# as plain text.
resp = check_response(requests.get(health_endpoint), text=True)

Then let's save the configured VLM model so we can use it in future requests. 

In [None]:
# Ask VSS which models it serves and remember the ID of the first one.
# Any failure (connection error, non-200 answer, unexpected body) is
# reported as "Server not ready"; configured_vlm is then left unset.
try:
    resp = requests.get(f"{vss_url}/models")
    resp = check_response(resp)
    models = resp["data"]
    configured_vlm = models[0]["id"]
except Exception as e:
    print(f'Server not ready: {e}')

In [None]:
# Show which VLM the following requests will use.
print("Configured VLM:", configured_vlm)

### Part 1: Direct VLM Access
<img alt="event reviewer apis" src="assets/event_review_apis.png" width=1000>

VSS 2.4 introduced a new Event Review feature that allows direct access to the VLM through the `/generate_vlm_captions` endpoints and a convenient `/reviewAlert` endpoint to generate boolean states and descriptions from the VLM over short video clips. 

The intention of the review alert endpoint is to provide a way to answer yes/no questions with direct VLM calls, bypassing the full ingestion and the LLM that are needed when using the `/chat/completions` or `/summarize` endpoints. 

This is best suited when low latency responses are required on short clips of video. 

The `/generate_vlm_captions` endpoint allows direct access to the VLM for the purpose of generating captions across an input video and receiving the captions in the request response. This can be done for both short and long videos. It can also be used for general Q&A on video clips to the VLM. 

For this notebook we will use a warehouse ladder safety example. In this two minute video, two workers climb a green ladder in the center aisle of the warehouse. One worker wears proper PPE (Hardhat & Safety Vest) while the other does not. 

<video width="1000"
       src="assets/warehouse_safety_video.mp4"
       controls>
</video>

The next cell will upload the warehouse ladder safety video to VSS. 

In [None]:
# Form fields that accompany the uploaded file.
data = dict(purpose="vision", media_type="video")

# Send the full warehouse video as a multipart upload: the "file" part
# carries the file name together with the open binary handle.
with open(warehouse_safety_video, "rb") as file:
    files = dict(file=("warehouse_safety_video.mp4", file))
    response = requests.post(upload_file_endpoint, files=files, data=data)

response = check_response(response)
# Keep the returned file ID; later requests refer to the video through it.
video_id = response["id"]

#### Part 1.1 VLM Captioning

The `/generate_vlm_captions` endpoint is similar to the `/summarize` endpoint however, it does not trigger the summarization or RAG pipelines that are used to generate summaries and enable Q&A on videos. The `/generate_vlm_captions` endpoint will only make calls to the VLM and directly return the VLM captions generated on the video. It will not generate summaries or store the captions into a database. 

This endpoint does not require the deployment of an LLM, embedding or reranker models, which makes it more suitable for lightweight single GPU and edge GPU deployments. 

If your application does not need to summarize or produce a database of the video captions, then you can call the `/generate_vlm_captions` endpoint to take advantage of the GPU optimized decoding and frame selection pipeline to have a high throughput VLM captioning pipeline. 

The `/generate_vlm_captions` endpoint is also suitable if you intend to build your own database connectors and agent to operate on top of the VLM generated captions. 

Using this endpoint is a great debugging tool if your summaries or Q&A through the `/summarize` and `/chat/completions` endpoints are not providing good accuracy. You can use this endpoint to easily inspect, tweak and tune your VLM prompts to ensure the VLM is producing descriptions with the relevant data for summarization and Q&A.

The following cell shows the body of the request used for the `/generate_vlm_captions` endpoint. It is nearly identical to the `/summarize` endpoint body but with fewer parameters because we are only configuring prompts and parameters for the VLM. 

In [9]:
# Request body for /generate_vlm_captions with reasoning disabled.
body = dict(
    id=video_id,                 # file ID returned by the upload
    model=configured_vlm,
    system_prompt="Your task is to accurately caption an input video.",
    prompt="Write a detailed dense caption describing the events in video.",
    max_tokens=512,              # VLM generation limit
    temperature=0.4,             # VLM sampling temperature
    top_p=0.4,                   # VLM nucleus sampling
    chunk_duration=20,
    enable_reasoning=False,
    chunk_overlap_duration=0,
    num_frames_per_chunk=20,
    vlm_input_width=1280,
    vlm_input_height=720,
)

With the body defined, we can post the request to the `/generate_vlm_captions` endpoint. 

In [None]:
# Submit the captioning request and keep the decoded JSON answer
# (None if the request did not return status 200).
response = check_response(requests.post(generate_vlm_captions_endpoint, json=body))

From the request response, we can extract the VLM caption for each chunk in the video. 

In [None]:
# Print the time range, reasoning trace and caption of every video chunk.
chunk_responses = response["chunk_responses"]
for chunk_response in chunk_responses:
    print(f"Time: {chunk_response['start_time']} - {chunk_response['end_time']}\n")
    print(f"Reasoning: {chunk_response['reasoning_description']}\n")
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions before 3.12.
    print(f"Caption: {chunk_response['content']}\n\n")


Scroll through the response and see how the VLM generates the captions over each video chunk. Try adjusting the system and user prompts to the VLM to see how the captions change.

In the next few cells, let's enable VLM reasoning and generate the captions again.

In [12]:
# Request body for /generate_vlm_captions with reasoning enabled and a
# larger token budget.
body = dict(
    id=video_id,                 # file ID returned by the upload
    model=configured_vlm,
    system_prompt="Your task is to accurately caption an input video.",
    prompt="Write a detailed dense caption describing the events in video.",
    max_tokens=1024,             # VLM generation limit
    temperature=0.4,             # VLM sampling temperature
    top_p=0.4,                   # VLM nucleus sampling
    chunk_duration=20,
    enable_reasoning=True,
    chunk_overlap_duration=0,
    num_frames_per_chunk=20,
    vlm_input_width=1280,
    vlm_input_height=720,
)

In [None]:
# Submit the reasoning-enabled captioning request and keep the decoded JSON
# answer (None if the request did not return status 200).
response = check_response(requests.post(generate_vlm_captions_endpoint, json=body))

In [None]:
# Print the time range, reasoning trace and caption of every video chunk.
chunk_responses = response["chunk_responses"]
for chunk_response in chunk_responses:
    print(f"Time: {chunk_response['start_time']} - {chunk_response['end_time']}\n")
    print(f"Reasoning: {chunk_response['reasoning_description']}\n")
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions before 3.12.
    print(f"Caption: {chunk_response['content']}\n\n")

Notice how with reasoning enabled, the additional `reasoning_description` field is populated. This is the reasoning trace generated by the VLM prior to generating the text in the `content` field. 

This reasoning trace can help improve the VLM's ability to understand the input frames to provide more detailed captions.

#### Part 1.2 Analyze Short Clips

Another way to use the `/generate_vlm_captions` endpoint is for direct Q&A to the VLM on short video clips. 

For this warehouse safety example, it would not be very efficient to continuously run a VLM to check whether someone on the ladder is wearing PPE. If deployed that way, the VLM would spend a significant amount of time being called on footage in which no one is on the ladder. 

A great use of VSS with direct access to the VLM is to combine it with a lightweight computer vision pipeline, such as a DeepStream detection pipeline, that runs continuously and detects when someone is on the ladder.

These short clips output by the computer vision pipeline can then be sent to the VLM to check for PPE and trigger an alert or notification. By combining a light weight computer vision pipeline with VSS, you can build an efficient pipeline suitable for edge deployments to gain insights that can't be answered with standard computer vision models. 

Let's take the two 30-second clips in which the person with PPE and the person without PPE are on the ladder and analyze them with VSS. 

![no ppe](assets/warehouse_ladder_no_ppe.png)
![ppe](assets/warehouse_ladder_ppe.png)

In [None]:
# Form fields that accompany the uploaded file.
data = dict(purpose="vision", media_type="video")

# Upload the short clip showing the worker WITH PPE.
with open(warehouse_safety_video_short_ppe, "rb") as file:
    files = dict(file=("warehouse_safety_video_short_ppe.mp4", file))
    response = requests.post(upload_file_endpoint, files=files, data=data)

response = check_response(response)
# File ID of the PPE clip, used by the Q&A and review requests.
video_id_ppe = response["id"]

In [None]:
# Form fields that accompany the uploaded file.
data = dict(purpose="vision", media_type="video")

# Upload the short clip showing the worker WITHOUT PPE.
with open(warehouse_safety_video_short_no_ppe, "rb") as file:
    files = dict(file=("warehouse_safety_video_short_no_ppe.mp4", file))
    response = requests.post(upload_file_endpoint, files=files, data=data)

response = check_response(response)
# File ID of the no-PPE clip, used by the Q&A and review requests.
video_id_no_ppe = response["id"]

We can define a simple wrapper around the /generate_vlm_captions endpoint for direct VLM Q&A to analyze the clips. 

In [17]:
def vlm_qna(video_id, question, enable_reasoning=False):
    """Ask the VLM a question about an uploaded video and return its answer.

    The question is sent as the user prompt of a /generate_vlm_captions
    request, and the content of the first chunk response is returned.

    Args:
        video_id: File ID returned when the video was uploaded.
        question: Question for the VLM (used as the user prompt).
        enable_reasoning: Passed through as the request's
            ``enable_reasoning`` flag.

    Returns:
        The ``content`` field of the first entry in ``chunk_responses``.

    Raises:
        TypeError: If the request does not return status 200, because
            check_response() then yields None, which cannot be indexed.
    """
    request_body = dict(
        id=video_id,
        model=configured_vlm,
        system_prompt="Your task is to view the video and answer the user's question accurately based on the video.",
        prompt=question,
        max_tokens=1024,
        temperature=0.4,
        top_p=0.4,
        # 120 s chunks: the notebook relies on this exceeding the clip length
        # so that a single VLM call covers the whole video.
        chunk_duration=120,
        enable_reasoning=enable_reasoning,
        chunk_overlap_duration=0,
        num_frames_per_chunk=20,
        vlm_input_width=1280,
        vlm_input_height=720,
    )
    raw_response = requests.post(generate_vlm_captions_endpoint, json=request_body)
    parsed = check_response(raw_response)
    first_chunk = parsed["chunk_responses"][0]
    return first_chunk["content"]

In the body of the request, we will set the chunk duration to a value larger than the video length to guarantee only 1 VLM call is made for the entire video. 

Let's try some questions on both of the videos.

In [None]:
# Ask about the clip of the worker with PPE.
answer = vlm_qna(video_id=video_id_ppe, question="Does anyone use a ladder?")
print(answer)


In [None]:
# Ask the same question about the clip of the worker without PPE.
answer = vlm_qna(video_id=video_id_no_ppe, question="Does anyone use a ladder?")
print(answer)

In [None]:
# Ask whether the worker in the PPE clip wears the required equipment.
# The question previously read "the person the ladder"; the missing "on"
# is restored so the VLM receives a well-formed question.
answer = vlm_qna(video_id_ppe, "Does the person on the ladder have a hardhat and safety vest?")
print(answer)

In [None]:
# Ask whether the worker in the no-PPE clip wears the required equipment.
# The question previously read "the person the ladder"; the missing "on"
# is restored so the VLM receives a well-formed question.
answer = vlm_qna(video_id_no_ppe, "Does the person on the ladder have a hardhat and safety vest?")
print(answer)

Let's try a more open-ended question and see how the VLM is able to reason through it. 

In [None]:
# Open-ended safety question on the PPE clip, with VLM reasoning enabled.
answer = vlm_qna(
    video_id=video_id_ppe,
    question="Is the person using the ladder safely?",
    enable_reasoning=True,
)
print(answer)

In [None]:
# Open-ended safety question on the no-PPE clip, with VLM reasoning enabled.
answer = vlm_qna(
    video_id=video_id_no_ppe,
    question="Is the person using the ladder safely?",
    enable_reasoning=True,
)
print(answer)

For short video clips, this is a good way to do low latency Q&A. However for long video clips, accuracy may degrade because the VLM can only take in a certain number of frames at once. As the length of the input video grows, the VLM will view fewer frames of the video and may miss important events to answer the user's question. 

For this reason, Q&A on long videos should use the `/summarize` and `/chat/completions` endpoints to take advantage of the LLM and GraphRAG for long video understanding. 

### Part 2: Event Review 

In the previous section, we saw how to directly access the VLM to generate captions and perform Q&A on short video clips. 

An alternative way to directly access the VLM to analyze a video is to use the `/reviewAlert` endpoint. This endpoint provides additional parameters and return values that make it more suitable to connect with a computer vision pipeline and video management system for building a full event review workflow. 

Oftentimes, a lightweight computer vision pipeline based on detection data and heuristic-based algorithms can try to determine if certain events or states in a video are True or False. For example, a computer vision pipeline can try to determine if two cars from a traffic camera have had a collision or if a person has walked through a door. However, traditional computer vision pipelines may have a high rate of false positives because detection data and heuristic-based algorithms for determining whether an event has occurred are not able to fully understand the context of the video. 

For this reason, a Vision Language Model can be used as a judge to evaluate the output of a computer vision pipeline to determine if a detected event should be true or false as well as provide further insights through a natural language description that cannot be derived from traditional computer vision models and pipelines. 

The endpoint uses the VLM to determine the answer to a yes/no question over a video clip and returns a boolean state that is based on the VLM's response. This boolean state can then be integrated with a notification system, dashboard or other applications to trigger alerts when the VLM determines the answer to a question is True. 




Let's look at the body of a `/reviewAlert` request. 

In [24]:
# Path of the uploaded PPE clip as seen by VSS: "<file id>/<file name>".
local_path = "{}/warehouse_safety_video_short_ppe.mp4".format(video_id_ppe)

# /reviewAlert request: event metadata plus the VSS/VLM parameters that
# control how the clip is analyzed.
body = {
    "version": "1.0",                                # API version
    "id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",    # event ID supplied by the client
    "@timestamp": "2024-05-30T01:41:25.000Z",        # time of the event
    "sensor_id": "camera-01",                        # camera that detected the event
    "video_path": local_path,                        # video file to inspect
    "confidence": 0.5,                               # optional score from the CV pipeline
    # Optional alert metadata for prioritizing alerts and defining alert types.
    "alert": dict(
        severity="CRITICAL",
        status="REVIEW_PENDING",
        type="Ladder Safety",
        description="Verify PPE usage on ladder",
    ),
    "event": dict(
        type="Ladder Safety Alert",
        description="Worker present on ladder",
    ),
    "vss_params": dict(
        chunk_duration=60,
        num_frames_per_chunk=10,
        do_verification=True,
        enable_reasoning=False,
        vlm_params=dict(
            system_prompt="You are a warehouse monitoring system tasked with identifying safety events in a warehouse. Answer the user's question accurately based on the input video.",
            prompt="Is the person on the ladder wearing a hardhat and safety vest?",
            max_tokens=1024,
            temperature=0.3,
            top_p=0.3,
        ),
    ),
}

The `/reviewAlert` endpoint was built to support a local computer vision pipeline outputting short video clips. For this reason, there is a video_path parameter that accepts a local file path to a video file to use as input to the VLM call. 

Video files uploaded through the `/files` endpoint can also be used by constructing the local path of the video file using the returned file ID and file name. 

There are additional parameters such as timestamp, sensor ID, alert severity, alert descriptions and more to attach extra metadata to the event. This metadata can be used by an external system to track the alerts.

The `vss_params` section should look very familiar to the /summarize and /generate_vlm_captions endpoint as it contains the standard parameters for chunk duration, VLM prompts and parameters. 

Let's submit the request and view the response.

In [None]:
# Submit the review request and keep the decoded JSON answer
# (None if the request did not return status 200).
response = check_response(requests.post(review_alert_endpoint, json=body))

From the request response, we can see it returned the metadata that was provided in the request body along with a new `result` field that contains the VLM results. 

In [None]:
# Pull out the VLM verdict and pretty-print it.
results = response["result"]
pretty_results = json.dumps(results, indent=4)
print(pretty_results)

In the results, the description field is the response from the VLM. 
The verification_result is the boolean state determined from the VLM's response to our question. 

In this case, we asked the VLM to verify the person on the ladder is wearing PPE which it verified as true. 

Let's try this again but on the 30-second clip of the worker without PPE. 

In [27]:
# Path of the uploaded no-PPE clip as seen by VSS: "<file id>/<file name>".
local_path = "{}/warehouse_safety_video_short_no_ppe.mp4".format(video_id_no_ppe)

# /reviewAlert request: event metadata plus the VSS/VLM parameters that
# control how the clip is analyzed.
body = {
    "version": "1.0",                                # API version
    "id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",    # event ID supplied by the client
    "@timestamp": "2024-05-30T01:41:25.000Z",        # time of the event
    "sensor_id": "camera-01",                        # camera that detected the event
    "video_path": local_path,                        # video file to inspect
    "confidence": 0.5,                               # optional score from the CV pipeline
    # Optional alert metadata for prioritizing alerts and defining alert types.
    "alert": dict(
        severity="CRITICAL",
        status="REVIEW_PENDING",
        type="Ladder Safety",
        description="Verify PPE usage on ladder",
    ),
    "event": dict(
        type="Ladder Safety Alert",
        description="Worker present on ladder",
    ),
    "vss_params": dict(
        chunk_duration=60,
        num_frames_per_chunk=10,
        do_verification=True,
        enable_reasoning=False,
        vlm_params=dict(
            system_prompt="You are a warehouse monitoring system tasked with identifying safety events in a warehouse. Answer the user's question accurately based on the input video.",
            prompt="Is the person on the ladder wearing a hardhat and safety vest?",
            max_tokens=1024,
            temperature=0.3,
            top_p=0.3,
        ),
    ),
}

In [None]:
# Submit the review request for the no-PPE clip and keep the decoded JSON
# answer (None if the request did not return status 200).
response = check_response(requests.post(review_alert_endpoint, json=body))

In [None]:
# Pull out the VLM verdict for the no-PPE clip and pretty-print it.
results = response["result"]
pretty_results = json.dumps(results, indent=4)
print(pretty_results)

With this video clip, we can see the description from the VLM does not include PPE and the verification result is False. 

For a full reference workflow combining the VSS Event Review features with a computer vision pipeline, visit the following page:
- https://docs.nvidia.com/vss/latest/content/vss_event_reviewer.html

---
### Review

In this notebook you learned the following:
- How to directly access the VLM in VSS 
- Generate VLM captions on a video 
- Analyze short video clips with direct VLM access 
- Generate boolean states and descriptions on short video clips 