## Setup

In [2]:
# Filesystem locations shared by the cells below (absolute paths on the
# author's machine -- adjust before running elsewhere).
part_path = "/home/naveenu/parts"        # directory the video segments are written to
output_path = "/home/naveenu/frames"     # directory holding extracted .jpg frames
video_path = "/home/naveenu/1_224p.mkv"  # full-length source video

## Model

In [3]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Single source of truth for the checkpoint, so the model weights and the
# processor are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct-AWQ"

# Load the model on the available device(s); placement is delegated to
# `device_map="auto"`.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.09s/it]


In [4]:
import os
from math import ceil

def get_frame_list(output_path, fraction=0.1):
    """Return ``file://`` paths for an evenly spaced subset of the .jpg frames.

    Args:
        output_path: Directory that contains the extracted ``.jpg`` frames.
        fraction: Share of the frames to keep. The number kept is
            ``ceil(total * fraction)``, capped at the number of frames
            available, so values of 1 or more keep every frame.

    Returns:
        A list of ``file://<output_path>/<name>.jpg`` strings in sorted
        filename order. The list is empty when the directory holds no
        ``.jpg`` files or when ``fraction`` selects nothing (``<= 0``).
    """
    # Sorted filenames give a stable, chronological order for zero-padded names.
    all_frames = sorted(f for f in os.listdir(output_path) if f.endswith('.jpg'))

    total_frames = len(all_frames)
    # Cap at total_frames so a fraction above 1 cannot drive the step to 0.
    frames_to_keep = min(ceil(total_frames * fraction), total_frames)
    if frames_to_keep <= 0:
        # Empty directory or non-positive fraction: nothing to select, and the
        # division below would otherwise fail.
        return []

    # Step between kept frames; always >= 1 because frames_to_keep <= total_frames.
    step = total_frames // frames_to_keep

    # Striding can yield one frame too many when the division is inexact,
    # hence the final trim to frames_to_keep.
    selected_frames = all_frames[::step][:frames_to_keep]

    return [f"file://{os.path.join(output_path, frame)}" for frame in selected_frames]

In [5]:
def query_video(prompt, use_frames=True, frames_path=output_path, video_path=None):
    """Ask the vision-language model a question about a video and print the answer.

    Args:
        prompt: The question or instruction sent to the model as text.
        use_frames: When True, the video is represented by a subset of
            pre-extracted frames chosen by ``get_frame_list(frames_path)``.
            When False, the video file at ``video_path`` is passed directly.
        frames_path: Directory of ``.jpg`` frames; only read when
            ``use_frames`` is True.
        video_path: Path to the video file; required when ``use_frames``
            is False and unused otherwise.

    Raises:
        ValueError: If ``use_frames`` is False and ``video_path`` is None, or
            if ``use_frames`` is True and no frames were selected.

    The decoded answer is printed (as a list with one string); nothing is
    returned.
    """
    if use_frames:
        # Read from the caller-supplied directory, not the module-level default.
        selected_frames = get_frame_list(frames_path)
        if not selected_frames:
            raise ValueError(f"No .jpg frames selected from {frames_path!r}")

        video_content = {
            "type": "video",
            "video": selected_frames,
            "fps": 1.0,
        }
    else:
        if video_path is None:
            # Without this check the message would reference "file://None".
            raise ValueError("video_path is required when use_frames=False")

        video_content = {
            "type": "video",
            "video": f"file://{video_path}",
            "max_pixels": 360 * 420,
            "fps": 1.0,
        }

    # Single-turn chat: one video entry followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                video_content,
                {"type": "text", "text": prompt},
            ],
        }
    ]

    print(f"Using {'frames' if use_frames else 'entire video'} for inference.")

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    if not use_frames:
        print(f"Video input shape: {video_inputs[0].shape}")
        num_frames, _, resized_height, resized_width = video_inputs[0].shape
        # NOTE(review): the /2 and /28 factors presumably mirror the model's
        # temporal and spatial patch merging -- confirm against the model docs.
        print(f"# of video tokens: {int(num_frames / 2 * resized_height / 28 * resized_width / 28)}")
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference
    with torch.no_grad():  # Use no_grad to save memory during inference
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each output sequence starts with its input ids; slice them off so only
    # the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode the generated text
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    print(output_text)
    # Release cached GPU memory between calls.
    torch.cuda.empty_cache()

## Data

In [6]:
### Generating

In [7]:
# !echo "Path has $(ls {output_path} | wc -l) frames before cleanup"
# ! rm -rf {output_path}
# !mkdir -p {output_path}
# !~/.conda/envs/eecs545/bin/ffmpeg -i {video_path} -loglevel -8 -r 1 -s 360x420 -q:v 2 -start_number 0 {output_path}/'%05d.jpg'

# !echo "Generated $(ls {output_path} | wc -l) frames"

In [10]:
# !echo "Path has $(ls {part_path} | wc -l) parts before cleanup"
# Start from an empty parts directory so segments left over from an earlier
# run are not counted or mixed in. NOTE: `rm -rf` is destructive -- make sure
# `part_path` points at a scratch directory.
!rm -rf {part_path}
!mkdir -p {part_path}
# Split the source video into segments of nominally 5 minutes (-segment_time
# 00:05:00) using ffmpeg's segment muxer (-f segment). Streams are copied
# rather than re-encoded (-c copy), all streams of the input are kept
# (-map 0), and timestamps restart in each segment (-reset_timestamps 1).
# Output files are numbered 000.mkv, 001.mkv, ...; -loglevel -8 silences ffmpeg.
!~/.conda/envs/eecs545/bin/ffmpeg -i {video_path} -loglevel -8  -c copy -map 0 -segment_time 00:05:00 -f segment -reset_timestamps 1 {part_path}/'%03d.mkv'

# Report how many segment files were produced.
!echo "Generated $(ls {part_path} | wc -l) parts"

Generated 9 parts


## Inference

### Using Frames

In [8]:
# query_video("describe the video in detail",
#             use_frames=True, video_path=f"{part_path}/000.mp4")

In [9]:
# query_video("What is the outfit's color of the gymnast?",
#             use_frames=True, video_path=video_path)

### Using Entire Video

In [10]:
# query_video("describe the video in detail",
#             use_frames=False, video_path=f"{part_path}/000.mkv")

In [11]:
# Timestamp question that points the model at the on-screen scorebug (segment 000).
query_video(
    prompt="Given the scorebug on the top left of the video, at what time in the game does the first foul occur?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.


qwen-vl-utils using decord to read video.


Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['The first foul occurs at 0:00:08.']


In [12]:
# Same first-foul question, this time without mentioning the scorebug (segment 000).
query_video(
    prompt="At what time in the game does the first foul occur?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.
Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['The first foul occurs at 0:00:12.']


In [13]:
# Timestamp question about a specific event: the first save (segment 000).
query_video(
    prompt="At what time in the game does the goalkeeper in orange make their first save?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.
Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['The goalkeeper in orange makes their first save at the 0:25 mark.']


In [14]:
# Score question tied to a point in game time (segment 000).
query_video(
    prompt="What is the scoreline at the end of the 4th minute?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.
Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['The scoreline at the end of the 4th minute is 0-0.']


In [15]:
# Yes/no question about whether a goal appears in segment 000.
query_video(
    prompt="Does the clip contain a goal?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.
Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['Yes, the clip shows a goal being scored by a player in a blue uniform.']


In [18]:
# Visual-attribute question about the goalkeepers' jerseys (segment 000).
query_video(
    prompt="What color jerseys are the goalkeepers wearing?",
    use_frames=False,
    video_path=f"{part_path}/000.mkv",
)

Using entire video for inference.
Video input shape: torch.Size([306, 3, 252, 448])
# of video tokens: 22032
['The goalkeepers are wearing yellow jerseys.']
