<a href="https://colab.research.google.com/github/sangjunyoo-phd/Video-Recognition-with-Gemma3-4b/blob/main/Video_Recognition_with_Gemma3_4b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import transformers
from transformers import pipeline, Gemma3ForConditionalGeneration, AutoProcessor
import torch # datatype -> torch.bfloat16
import cv2
from google.colab.patches import cv2_imshow
import numpy as np
from PIL import Image

print(transformers.__version__)

In [None]:
# Interactive Hugging Face Hub login; the token is typed at the prompt rather
# than stored in the notebook.
# NOTE(review): presumably needed because the model repository is gated — confirm.
!huggingface-cli login

In [None]:
# Hub identifier passed to both the model and the processor loaders below.
model_name = "google/gemma-3-4b-it"

In [None]:
# Load the model weights in bfloat16 and let device_map="auto" decide where
# they are placed.
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype = torch.bfloat16,
    device_map="auto"
)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_name)

In [None]:
# Wrap the already-loaded model and processor in an image-text-to-text pipeline.
# NOTE(review): `model` was loaded with device_map="auto" above, so passing
# device_map again here is presumably redundant — confirm against the pipeline docs.
pipe = pipeline("image-text-to-text",
                model=model,
                processor=processor,
                device_map="auto")

In [None]:
# Change to the path of your video.
# NOTE(review): the path points under /content/drive, but no cell visible here
# mounts Google Drive — presumably it is mounted manually beforehand; confirm.
video_path = "/content/drive/MyDrive/Colab Notebooks/WIN_20250402_21_02_19_Pro.mp4"

In [None]:
# Decode every frame of the video into memory together with its timestamp.
# All frames are kept in `frames`, so long or high-resolution videos can use a
# lot of RAM.
video = cv2.VideoCapture(video_path)

# Fail early with a clear message when the capture could not be opened (e.g. a
# wrong path) instead of hitting an IndexError on frames[0] further down.
if not video.isOpened():
    video.release()
    raise IOError(f"Could not open video file: {video_path}")

frames = []
timestamp_in_sec = []
try:
    while True:
        ret, frame = video.read()
        if not ret:  # no more frames could be read
            break
        frames.append(frame)
        # NOTE(review): the position is queried after read(); whether it refers
        # to the frame just decoded or to the next one may depend on the
        # backend — confirm if exact timestamps matter.
        timestamp_in_ms = video.get(cv2.CAP_PROP_POS_MSEC)
        timestamp_in_sec.append(timestamp_in_ms / 1000)
finally:
    # Always free the capture handle, even if decoding raised.
    video.release()

if not frames:
    raise ValueError(f"No frames could be read from: {video_path}")

print(f"Number of Frames: {len(frames)}\tShape of Frame: {frames[0].shape}")

In [None]:
def sample_frames(frames, timestamp_in_sec, num_frames, resize=False):
  """Pick up to `num_frames` evenly spaced frames and their timestamps.

  Args:
    frames: Sequence of frames. When `resize` is True each frame must expose
      `.shape` with height and width first and be accepted by `cv2.resize`.
    timestamp_in_sec: Sequence of timestamps, indexed in parallel with `frames`.
    num_frames: Number of frames requested. If it exceeds `len(frames)`, every
      frame is returned once instead of repeating frames.
    resize: If True, halve the width and height of each sampled frame to save
      memory.

  Returns:
    A tuple `(sampled_frames, sampled_timestamp)` of two equally long lists.
    Both are empty when `frames` is empty.
  """
  if len(frames) == 0:
    return [], []

  # With more samples than frames, linspace would repeat indices and the same
  # frame would be returned several times; cap the request instead.
  num_frames = min(num_frames, len(frames))
  frame_indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)

  sampled_frames = []
  sampled_timestamp = []
  for i in frame_indices:
    frame = frames[i]
    sampled_timestamp.append(timestamp_in_sec[i])
    if resize:  # Might be helpful to save memory
      original_height, original_width = frame.shape[:2]
      # Never ask for a zero-sized target, which cv2.resize rejects.
      half_size = (max(1, original_width // 2), max(1, original_height // 2))
      frame = cv2.resize(frame, half_size)
    sampled_frames.append(frame)
  return sampled_frames, sampled_timestamp

In [None]:
# Request 20 frames spread across the whole video; resize=True halves the width
# and height of each sampled frame.
sampled_frames, sampled_timestamp = sample_frames(frames, timestamp_in_sec, 20, resize=True)

In [None]:
# Chat-style prompt: a single user turn that starts with the question. The
# per-frame timestamp and image entries are appended to "content" by a later cell.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is happening in the video?"
                }
            # Frame entries are appended here, after the question.
            # Each image entry should be a valid URL or a path to an image file,
            # so the frames are saved to disk while iterating over them.
        ]
    }
]

In [None]:
# Re-running this cell must not stack a second copy of the frames onto the
# prompt, so drop everything after the leading question before appending.
del messages[-1]["content"][1:]

for i, (frame, timestamp) in enumerate(zip(sampled_frames, sampled_timestamp)):
  # Convert the frame from BGR to RGB before handing it to PIL, then write it
  # to disk so the message can reference it by path.
  image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
  frame_path = f"/content/frame_{i}.jpg"
  image.save(frame_path)

  time_in_sec = round(timestamp, 2)

  # Each frame is announced by its timestamp, followed by the image itself.
  messages[-1]["content"].extend([
      {
          "type": "text",
          "text": f"timestamp of the following frame is {time_in_sec} sec."
      },
      {
          "type": "image",
          "image": frame_path
      },
  ])

In [None]:
# Run the model on the assembled prompt, generating at most 200 new tokens.
output = pipe(text=messages, max_new_tokens = 200)
# Print the "content" of the last entry in the first result's "generated_text".
# NOTE(review): presumably that last entry is the model's reply, following the
# echoed conversation — confirm against the pipeline's output format.
print(output[0]["generated_text"][-1]['content'])