### Install the required Python packages

In [None]:
!pip install -q transformers flash_attn timm einops peft

### Imports

In [None]:
# @title Imports
import cv2
import io
import os
import re
import json
import torch
import html
import base64
import itertools

import numpy as np
import supervision as sv

from google.colab import userdata
from IPython.core.display import display, HTML
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    AutoModelForCausalLM,
    AutoProcessor,
    get_scheduler
)
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Generator
from peft import LoraConfig, get_peft_model
from PIL import Image
from roboflow import Roboflow

### Load the model using AutoModelForCausalLM and the processor using AutoProcessor classes from the transformers library.

In [None]:
# Hub checkpoint and the exact revision to pull.
CHECKPOINT = "microsoft/Florence-2-base-ft"
REVISION = 'refs/pr/6'

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both loaders receive the same keyword arguments, so they are declared once.
_pretrained_kwargs = {"trust_remote_code": True, "revision": REVISION}

model = AutoModelForCausalLM.from_pretrained(CHECKPOINT, **_pretrained_kwargs)
model = model.to(DEVICE)
processor = AutoProcessor.from_pretrained(CHECKPOINT, **_pretrained_kwargs)

### Create Folder

In [None]:
# Folder for processed frames; exist_ok makes re-running this cell a no-op
# and avoids the check-then-create race of a separate os.path.exists() test.
output_folder = "processed_frames"
os.makedirs(output_folder, exist_ok=True)

### Extract Frames, Generate a Detailed Caption for Each, and Use That Caption for Caption-to-Phrase Grounding

In [None]:
# Create a folder to store the processed images
output_folder = "processed_frames2"
os.makedirs(output_folder, exist_ok=True)

VIDEO_PATH = '/content/input_video2.mp4'
FRAME_STRIDE = 10  # the model is run on every 10th decoded frame only

task = "<DETAILED_CAPTION>"
text = "<DETAILED_CAPTION>"
task_2 = "<CAPTION_TO_PHRASE_GROUNDING>"


def _run_florence_task(task_token, prompt, image_rgb, image_size):
    """Run one prompt through the model and return the post-processed result.

    Args:
        task_token: Task identifier handed to ``post_process_generation``.
        prompt: Full text prompt given to the processor.
        image_rgb: Frame as an RGB array (OpenCV frames must be converted first).
        image_size: ``(width, height)`` of the frame, forwarded to post-processing.

    Returns:
        Whatever ``processor.post_process_generation`` returns; the caller
        indexes it by ``task_token``.
    """
    inputs = processor(text=prompt, images=image_rgb, return_tensors="pt").to(DEVICE)
    # Inference only: no autograd bookkeeping is needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
        )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text, task=task_token, image_size=image_size
    )


cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    cap.release()
    # Fail loudly instead of silently producing an empty result.
    raise OSError(f"Could not open video file: {VIDEO_PATH}")

frame_number = 1        # 1-based index of *processed* frames; used in output file names
frame_and_caption = {}  # processed-frame index -> generated detailed caption
frame_count = 0         # number of frames decoded from the video so far

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # Exit the loop if no more frames are available

        frame_count += 1
        if frame_count % FRAME_STRIDE != 0:
            continue

        # OpenCV decodes to BGR; give the processor an RGB copy and keep the
        # BGR original for drawing and saving with cv2.
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_size = (frame.shape[1], frame.shape[0])  # (width, height)

        # Pass 1: detailed caption for the frame.
        response = _run_florence_task(task, text, image_rgb, image_size)
        caption = response.get(task)
        if caption is None:
            # Without a caption there is nothing to ground; skip this frame
            # rather than failing on a missing key.
            print(f"No caption returned for decoded frame {frame_count}; skipping.")
            continue

        # Store the generated caption for the current frame
        frame_and_caption[frame_number] = caption

        # Pass 2: ground the phrases of that caption in the same frame.
        text_2 = f"<CAPTION_TO_PHRASE_GROUNDING> {caption}"
        response_2 = _run_florence_task(task_2, text_2, image_rgb, image_size)
        grounding = response_2[task_2]

        for bbox, label in zip(grounding['bboxes'], grounding['labels']):
            # Extract bounding box coordinates
            x1, y1, x2, y2 = (int(coord) for coord in bbox)

            # Draw the rectangle for the bounding box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Put the label text above the bounding box
            cv2.putText(frame, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Save the annotated frame to the output folder
        output_filename = f"{output_folder}/frame_{frame_number}.jpg"
        if not cv2.imwrite(output_filename, frame):
            raise OSError(f"Could not write image: {output_filename}")

        frame_number += 1
finally:
    # Release the capture even if the model or the disk write fails.
    # No window is ever opened in this cell, so there is no key polling
    # and no window teardown to perform.
    cap.release()

# Print the stored frame and caption data
print(frame_and_caption)

### Download the output images as a zip archive

In [None]:
import os
import shutil

# Folder holding the annotated frames, and the archive built from it
folder_path = '/content/processed_frames2'
zip_path = '/content/processed_frames2.zip'

# Drop any archive left over from an earlier run
if os.path.exists(zip_path):
    os.remove(zip_path)

# make_archive appends the ".zip" suffix to base_name itself
shutil.make_archive(
    base_name='/content/processed_frames2',
    format='zip',
    root_dir=folder_path,
)

# Hand the archive to the browser for download
from google.colab import files
files.download(zip_path)