In [None]:
# Copyright 2024 Reddit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Use Case 3. Video Descriptions

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/reddit/kdd2024-tutorial-breaking-barriers/blob/master/Use_Case_3_Video_Descriptions.ipynb)

## Overview

This notebook guides participants through the
steps of designing and implementing a pipeline to generate video
descriptions, combining keyframe extraction, image captioning,
audio transcription, and summarization using LLMs. We will explore
the challenges and advantages for different types of video content.

---

## Setting Up Google Colab
Google Colab provides a convenient platform to run Python code in the cloud, with access to powerful computing resources, including GPUs. Again, for this tutorial, it is recommended to enable GPU acceleration:

1.   Click on *Runtime* in the top menu.
2.   Select *Change runtime type*.
3.   In the dialog that appears, under *Hardware accelerator*, choose **T4 GPU** (or any other GPU that you may have access to) if it is not already enabled.
4.   Click *Save*.

---

## Requirements


Run the following cell to install the required Python packages.

In [None]:
# Install Whisper (speech-to-text) and the Hugging Face model dependencies; -U upgrades packages that are already installed
!pip install -U openai-whisper transformers bitsandbytes accelerate flash_attn

---

## Settings

Run the following cells to apply some convenient settings.

In [None]:
# Configure logging: INFO level for the root logger, errors only for Transformers
import logging
logging.basicConfig(level=logging.INFO)

import transformers
transformers.logging.set_verbosity_error()

# Silence Python warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Set GPU device
import torch
torch.set_default_device("cuda") # or "cpu" if a GPU is not available

Run the following cell to get the run time on every cell execution:

In [None]:
# Install and load the autotime extension so every cell reports its run time
!pip install ipython-autotime
%load_ext autotime

Run the following cell to enable line wrapping when printing long strings:

In [None]:
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Inject a CSS rule so that preformatted cell output wraps long lines.

  This function is registered below as a `pre_run_cell` callback. IPython
  passes an execution-info argument to such callbacks, so any positional or
  keyword arguments are accepted and ignored.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Re-apply the style before every cell execution
get_ipython().events.register('pre_run_cell', set_css)

Some other useful imports:

In [None]:
# Note: PIL's image module is named `Image` (capitalized)
from PIL import Image
from tqdm import tqdm
from glob import glob

---

## Test Videos

We will use the first video in the tutorial dataset.

In [None]:
# Download the sample video; later cells read it as "video1.mp4" from the working directory
!wget "https://raw.githubusercontent.com/reddit/kdd2024-tutorial-breaking-barriers/main/media/video1.mp4"

In [None]:
# Show the downloaded video inline in the notebook.
# NOTE(review): maxduration=300 presumably raises moviepy's limit on embeddable clip length (seconds) - confirm in the moviepy docs.
import moviepy.editor
moviepy.editor.ipython_display("video1.mp4", height=400, maxduration=300)

---

## Use Case 1. Videos with Speech

The first use case is focused on videos that have a speech track.

For these videos, the best approach is to:

1. Generate the audio transcript (with a speech-to-text LLM such as Whisper) and then

2. Use an LLM to generate a summary of the transcript.

### 1) Extract Audio Transcript

In [None]:
# Import the speech-to-text package
import whisper

# Load model - "base" is used here; choose the most appropriate one
stt_model = whisper.load_model("base")

# Transcribe audio/video; the transcript is stored under the "text" key of the result
result = stt_model.transcribe("video1.mp4")
transcript = result["text"]

# Print transcript
print(transcript)

### 2) Generate Description based on Transcript

In this case we are going to choose the same `imp-v1-3b` model that we used for image captions.

In [None]:
# Load the model and its tokenizer in half precision (float16).
# NOTE: trust_remote_code=True allows Python code shipped with the model repository to run.
from transformers import AutoModelForCausalLM, AutoTokenizer

vision_model = AutoModelForCausalLM.from_pretrained(
    "MILVLG/imp-v1-3b",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True)
vision_tokenizer = AutoTokenizer.from_pretrained(
    "MILVLG/imp-v1-3b",
    trust_remote_code=True)

In [None]:
# Text prompt
prompt = f"Write a short summary for a video that has the following transcript:\n{transcript}"

# Process text: wrap the prompt as a single user turn, then tokenize it
text = vision_tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True
)
input_ids = vision_tokenizer(text, return_tensors="pt").input_ids
image_tensor = None  # text-only request, so no image is passed

# Generate the answer (at most 256 new tokens); [0] takes the first element of the result
output_ids = vision_model.generate(
    input_ids,
    max_new_tokens=256,
    images=image_tensor,
    use_cache=True)[0]
# Decode only the tokens that follow the prompt (slice past the prompt length)
description = vision_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)

# Display description
print(description)

#### Improved Implementation

In [None]:
def prompt(vision_tokenizer, vision_model, prompt, image = None):
  """Send a single-turn prompt, optionally with an image, to the model.

  Args:
    vision_tokenizer: Tokenizer used to build the chat text and to decode
      the generated tokens.
    vision_model: Model providing `image_preprocess` and `generate`.
    prompt: Text of the user message.
    image: Optional image handed to `vision_model.image_preprocess`. When
      None, the request is text-only.

  Returns:
    The decoded text produced after the prompt tokens.
  """
  # Process text and optionally image
  text = vision_tokenizer.apply_chat_template(
      [{"role": "user", "content": prompt}],
      tokenize=False,
      add_generation_prompt=True
  )
  input_ids = vision_tokenizer(text, return_tensors="pt").input_ids
  # Use the model received as an argument (a plain function has no `self`)
  # and compare against None instead of relying on the image's truthiness.
  if image is not None:
    image_tensor = vision_model.image_preprocess(image)
  else:
    image_tensor = None

  # Generate the answer
  output_ids = vision_model.generate(
      input_ids,
      max_new_tokens=256,
      images=image_tensor,
      use_cache=True)[0]

  # Skip the prompt tokens and decode only what was generated after them
  result = vision_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)

  return result

# Run: summarize the transcript through the prompt() helper (text-only, no image)
description = prompt(vision_tokenizer,
                     vision_model,
                     f"Write a short summary for a video that has the following transcript:\n{transcript}")
print(description)

---

## Use Case 2. Videos without Speech

In this case, the only available information is the video frames, so the approach is:

1. Extract the key frames of the video, those that best represent its contents.
2. Generate a caption for each frame using a multimodal LLM.
3. Use an LLM to generate a summary based on all the frame captions.

### 1) Extract Key Frames

We will extract the key frames using the `ffmpeg` tool and save them to the `keyframes` folder:

In [None]:
# -p keeps the cell re-runnable: no error if the folder already exists
!mkdir -p keyframes
# Decode only key frames (-skip_frame nokey) and write them as numbered JPEG files
!ffmpeg -skip_frame nokey -i "video1.mp4" -vsync vfr "keyframes/frame-%2d.jpg" -hide_banner -loglevel error
!ls keyframes

In [None]:
all_frames = sorted(glob("keyframes/*jpg"))

frames_to_process = 5

# Pick at most `frames_to_process` frames spread evenly from the first to the
# last key frame. Indices are computed explicitly so that the step can never
# be zero, which is what a slice step derived from the frame count produced
# for short videos.
if len(all_frames) <= frames_to_process:
  frames = all_frames
else:
  last_index = len(all_frames) - 1
  gaps = max(frames_to_process - 1, 1)
  frames = [all_frames[(i * last_index) // gaps] for i in range(frames_to_process)]

for frame in frames:
  display(frame, Image.open(frame))

Programmatically:

### 2) Generate Frame Captions

In [None]:
captions = []
for frame in tqdm(frames, total=len(frames)):
  # Open each frame once and close the file as soon as its caption is ready
  with Image.open(frame) as image:
    caption = prompt(vision_tokenizer,
                     vision_model,
                     "<image>\nWrite a very short caption for the image with less than 20 words",
                     image)
  captions.append(caption)
captions

### 3) Generate Description based on Frame Captions

In [None]:
# Join the captions outside the f-string: a backslash inside an f-string
# expression is a syntax error before Python 3.12.
frame_list = "\n-".join(captions)
description = prompt(vision_tokenizer,
                     vision_model,
                     f"Write a short summary for a video that has the following frames:\n-{frame_list}")
print(description)

---

## Complete Implementation (MLLMv2)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import whisper
from PIL import Image
import tempfile
from glob import glob
import os
from tqdm import tqdm

class MLLMv2:
  """Image and video description helper built on `MILVLG/imp-v1-3b` and Whisper.

  The vision-language model captions images and writes summaries; the Whisper
  "base" model transcribes the audio track of videos.
  """

  def __init__(self):
    """Load the vision-language model, its tokenizer and the speech-to-text model."""
    torch.set_default_device("cuda")
    self.vision_model = AutoModelForCausalLM.from_pretrained(
      "MILVLG/imp-v1-3b",
      torch_dtype=torch.float16,
      device_map="auto",
      trust_remote_code=True)
    self.vision_tokenizer = AutoTokenizer.from_pretrained(
        "MILVLG/imp-v1-3b",
        trust_remote_code=True)
    self.stt_model = whisper.load_model("base")

  def get_image_caption(self,
                        image: "Image.Image",
                        base_prompt="Write a very short caption for the image with less than 20 words") -> str:
    """Return a short caption for `image` generated with `base_prompt`."""
    return self.prompt_llm(image, base_prompt)

  def get_image_description(self,
                            image: "Image.Image",
                            base_prompt="Write a short description for the image") -> str:
    """Return a description for `image` generated with `base_prompt`."""
    return self.prompt_llm(image, base_prompt)

  def get_video_transcript(self,
                           video_file: str) -> str:
    """Transcribe `video_file` with this instance's speech-to-text model."""
    result = self.stt_model.transcribe(video_file)
    return result["text"]

  @staticmethod
  def _select_evenly_spaced(items, count):
    """Return up to `count` items spread evenly from the first to the last item."""
    if count <= 0 or not items:
      return []
    if len(items) <= count:
      return list(items)
    last_index = len(items) - 1
    gaps = max(count - 1, 1)
    # Integer arithmetic keeps the indices distinct and ends exactly on the
    # last item when more than one item is requested.
    return [items[(i * last_index) // gaps] for i in range(count)]

  @staticmethod
  def _format_bullets(lines):
    """Render `lines` as a dash-prefixed list, one entry per line."""
    return "-" + "\n-".join(lines)

  def get_video_frame_captions(self,
                               video_file: str,
                               base_prompt="Write a very short caption for the image with less than 20 words",
                               frames_to_process=5) -> dict:
    """Extract key frames from `video_file` and caption a subset of them.

    Args:
      video_file: Path of the video to process.
      base_prompt: Prompt used to caption every selected frame.
      frames_to_process: Maximum number of frames to caption.

    Returns:
      A dict with "all_frames" (paths of every extracted key frame), "frames"
      (paths of the captioned frames) and "captions" (one caption per selected
      frame). The paths point into a temporary directory that no longer exists
      once this method returns.
    """
    # Local import: subprocess is not among the imports of this cell.
    import subprocess

    captions = []
    with tempfile.TemporaryDirectory() as tmpdirname:
      # An argument list avoids shell quoting problems with unusual file
      # names. The exit status is not checked: a failed extraction simply
      # yields no frames.
      subprocess.run(
          ["ffmpeg", "-skip_frame", "nokey", "-i", video_file,
           "-vsync", "vfr", f"{tmpdirname}/frame-%2d.jpg",
           "-hide_banner", "-loglevel", "error"],
          check=False)
      all_frames = sorted(glob(f"{tmpdirname}/*jpg"))
      print(f"All frames: {len(all_frames)}")
      frames = self._select_evenly_spaced(all_frames, frames_to_process)
      print(f"Selected frames: {len(frames)}")
      # Generate captions while the temporary directory still exists
      for frame in tqdm(frames, total=len(frames)):
        with Image.open(frame) as image:
          captions.append(self.prompt_llm(image, base_prompt))
    return {"all_frames": all_frames,
            "frames": frames,
            "captions": captions}

  def get_video_description(self,
                            video_file: str,
                            include_audio: bool,
                            include_frames: bool) -> dict:
    """Summarize a video from its transcript and/or its frame captions.

    Args:
      video_file: Path of the video to describe.
      include_audio: Whether to transcribe the audio and use the transcript.
      include_frames: Whether to caption key frames and use the captions.

    Returns:
      A dict with "transcript", "captions" and "description". Sources that
      were not requested are None, and "description" is None when there is
      neither a transcript nor any caption to summarize.
    """
    transcript = None
    captions = None
    if include_audio:
      transcript = self.get_video_transcript(video_file)
    if include_frames:
      captions = self.get_video_frame_captions(video_file)["captions"]

    # Stays None when there is nothing to summarize
    prompt = None
    if transcript and captions:
      prompt = "Write a short summary for a video that has the following transcript and frames:\n"
      # The trailing newline keeps the next header on its own line
      prompt += f"## Transcript:\n{transcript}\n"
      prompt += f"## Frames:\n{self._format_bullets(captions)}"
    elif transcript:
      prompt = f"Write a short summary for a video that has the following transcript:\n{transcript}"
    elif captions:
      prompt = f"Write a short summary for a video that has the following frames:\n{self._format_bullets(captions)}"

    result = self.prompt_llm(None, prompt) if prompt else None
    return {"transcript": transcript,
            "captions": captions,
            "description": result}

  def prompt_llm(self,
                 image: "Image.Image",
                 prompt: str,
                 max_new_tokens: int = 256,
                 temperature: float = 0.9,
                 top_k: int = 50,
                 top_p: float = 0.95) -> str:
    """Run a single-turn prompt, optionally with an image, through the model.

    Args:
      image: Image to attach, or None for a text-only request.
      prompt: Text of the user message.
      max_new_tokens: Upper bound on the number of generated tokens.
      temperature: Sampling temperature passed to `generate`.
      top_k: Top-k value passed to `generate`.
      top_p: Top-p value passed to `generate`.

    Returns:
      The generated text with newlines replaced by spaces.
    """
    if image is not None:
      # Image requests carry an "<image>" line ahead of the prompt text
      content = f"<image>\n{prompt}"
      image_tensor = self.vision_model.image_preprocess(image)
    else:
      content = f"{prompt}"
      image_tensor = None
    text = self.vision_tokenizer.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True
    )
    input_ids = self.vision_tokenizer(text, return_tensors="pt").input_ids
    output_ids = self.vision_model.generate(
      input_ids,
      max_new_tokens=max_new_tokens,
      images=image_tensor,
      temperature=temperature,
      do_sample=True,
      top_k=top_k,
      top_p=top_p,
      use_cache=True)[0]
    # Decode only the tokens generated after the prompt
    response = self.vision_tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)
    response = response.replace("\n", " ").strip().replace("  ", " ")
    return response

In [None]:
# Load model: instantiating the class loads the vision model, its tokenizer and Whisper
vc = MLLMv2()

In [None]:
# Speech Transcript: transcribe the audio track of the sample video
vc.get_video_transcript("video1.mp4")

In [None]:
# Frame Captions: extract key frames from the sample video and caption the selected ones
vc.get_video_frame_captions("video1.mp4")

In [None]:
# Describe the video from its speech transcript only
vc.get_video_description("video1.mp4", include_audio=True, include_frames=False)

In [None]:
# Describe the video from its key-frame captions only
vc.get_video_description("video1.mp4", include_audio=False, include_frames=True)

In [None]:
# Describe the video from both its speech transcript and its key-frame captions
vc.get_video_description("video1.mp4", include_audio=True, include_frames=True)

---

# Discussion: How Video Description with Multimodal LLMs can improve Accessibility in Social Media

- **Accessibility for all**: By making videos more accessible, multimodal LLMs can encourage greater participation and engagement from users with disabilities, fostering a more inclusive online community.
- **Automatic tagging and categorization**: LLMs can analyze the generated captions and speech transcripts to automatically assign relevant tags and categories to videos, making it easier for users with disabilities to find relevant content.
- **Multiple language support**: LLMs can be trained to generate captions and tags in multiple languages, improving accessibility for a wider range of users.
- **Speech-to-text integration**: Utilizing speech transcripts alongside video frames, multimodal LLMs can generate more accurate and detailed captions for videos, even if the audio is unclear or contains background noise.
- **Key frame captions**: For videos without speech, LLMs can analyze key frames in videos and generate descriptive captions for each, providing a textual summary for users who cannot watch the video.
- **Understanding user engagement**: By analyzing user interactions with captions and tags, LLMs can provide valuable data on the accessibility of different content formats, enabling platform owners to improve user experience.