In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install transformers



In [None]:

import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection
import clip
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import time
import numpy as np

In [None]:
# Choose the compute device up front: the GPU when torch can see one,
# otherwise the CPU. Both models below are placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# OWL-ViT: the processor prepares text/image inputs and post-processes the
# detection outputs; the model is moved to the selected device right away.
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)

# CLIP: returns the model together with its matching image preprocessing transform.
clip_model, preprocess = clip.load("ViT-B/32", device)


In [None]:

def process_video_with_owlvit(video_path, text_labels, output_path='output_with_owlvit.avi',
                              output_fps=2.0, threshold=0.1, show_frames=True):
    """Run OWL-ViT detection on every frame of a video and save an annotated copy.

    Each frame is passed with ``text_labels`` through the notebook-level
    ``processor`` / ``owlvit_model`` (on ``device``). Detections scoring at
    least ``threshold`` are drawn as red boxes with ``label: score`` captions,
    the annotated frame is appended to an XVID-encoded video and, optionally,
    shown inline with matplotlib.

    Args:
        video_path: Path of the input video, as accepted by ``cv2.VideoCapture``.
        text_labels: Text queries, forwarded unchanged to the processor and to
            its post-processing step (the notebook passes a list containing one
            list of strings).
        output_path: Destination file for the annotated video.
        output_fps: Frame rate written into the output video.
        threshold: Minimum detection score kept by the post-processing step.
        show_frames: When True, display every annotated frame inline. The time
            spent rendering is included in the reported FPS.

    Raises:
        IOError: If the input video or the output writer cannot be opened.
        ValueError: If no frame can be read from the input video.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    fps_list = []

    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path!r}")

        # The first frame supplies the output dimensions. It is processed
        # directly by the loop below instead of rewinding the capture, because
        # seeking via CAP_PROP_POS_FRAMES is not reliable on every backend.
        ret, frame = cap.read()
        if not ret or frame is None:
            raise ValueError(f"Video contains no readable frames: {video_path!r}")
        height, width = frame.shape[:2]

        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, output_fps, (width, height))
        if not out.isOpened():
            # Without this check every out.write() would be dropped silently.
            raise IOError(f"Could not open video writer for: {output_path!r}")

        # Every frame has the same size, so the post-processing target size
        # is built once rather than per frame.
        target_sizes = torch.tensor([(height, width)], device=device)

        while ret:
            start = time.time()
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Pass the frame and text labels into the OWL-ViT model.
            inputs = processor(text=text_labels, images=image, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            # Inference only: skip autograd bookkeeping to save memory and time.
            with torch.no_grad():
                outputs = owlvit_model(**inputs)

            # Convert raw outputs into boxes in pixel coordinates of the frame.
            results = processor.post_process_grounded_object_detection(
                outputs, target_sizes=target_sizes, threshold=threshold, text_labels=text_labels
            )[0]

            # Annotate the (BGR) frame in place for the output video.
            for box, score, label in zip(results["boxes"], results["scores"], results["text_labels"]):
                xmin, ymin, xmax, ymax = map(int, box.tolist())
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)
                # Keep the caption inside the image when the box touches the top edge.
                text_y = max(ymin - 10, 20)
                cv2.putText(frame, f"{label}: {float(score):.2f}", (xmin, text_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            out.write(frame)

            if show_frames:
                fig, ax = plt.subplots(1, figsize=(12, 8))
                # OpenCV frames are BGR; matplotlib expects RGB.
                ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                ax.axis('off')
                plt.tight_layout()
                plt.show()
                plt.close(fig)  # free the figure; one is created per frame

            elapsed = time.time() - start
            if elapsed > 0:
                fps_list.append(1 / elapsed)

            ret, frame = cap.read()
    finally:
        # Release the capture and the writer even when a frame fails mid-way.
        cap.release()
        if out is not None:
            out.release()

    print(f"Video saved as '{output_path}'")
    if fps_list:
        print(f"Average FPS: {sum(fps_list)/len(fps_list):.2f}")

In [None]:
def process_video_with_clip(video_path, text_labels, output_path='output_with_clip.avi',
                            output_fps=2.0, show_frames=True):
    """Label every frame of a video with its best-matching text via CLIP and save the result.

    For each frame, the cosine similarity between the frame's CLIP image
    embedding and the embedding of every entry in ``text_labels`` is computed
    with the notebook-level ``clip_model`` / ``preprocess`` (on ``device``).
    The highest-scoring label and its similarity are drawn in the top-left
    corner, the annotated frame is appended to an XVID-encoded video and,
    optionally, shown inline with matplotlib.

    Args:
        video_path: Path of the input video, as accepted by ``cv2.VideoCapture``.
        text_labels: Non-empty sequence of candidate label strings.
        output_path: Destination file for the annotated video.
        output_fps: Frame rate written into the output video.
        show_frames: When True, display every annotated frame inline. The time
            spent rendering is included in the reported FPS.

    Raises:
        ValueError: If ``text_labels`` is empty or no frame can be read.
        IOError: If the input video or the output writer cannot be opened.
    """
    labels = list(text_labels)
    if not labels:
        raise ValueError("text_labels must contain at least one label")

    cap = cv2.VideoCapture(video_path)
    out = None
    fps_list = []

    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path!r}")

        # The first frame supplies the output dimensions. It is processed
        # directly by the loop below instead of rewinding the capture, because
        # seeking via CAP_PROP_POS_FRAMES is not reliable on every backend.
        ret, frame = cap.read()
        if not ret or frame is None:
            raise ValueError(f"Video contains no readable frames: {video_path!r}")
        height, width = frame.shape[:2]

        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, output_fps, (width, height))
        if not out.isOpened():
            # Without this check every out.write() would be dropped silently.
            raise IOError(f"Could not open video writer for: {output_path!r}")

        # The labels never change between frames, so tokenize, encode and
        # normalize them once instead of on every iteration.
        text_inputs = clip.tokenize(labels).to(device)
        with torch.no_grad():
            text_features = clip_model.encode_text(text_inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Process the video frame by frame.
        while ret:
            start = time.time()

            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            image_input = preprocess(image).unsqueeze(0).to(device)

            with torch.no_grad():
                image_features = clip_model.encode_image(image_input)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                # Both sides are unit-normalized, so the dot product is the cosine similarity.
                similarity = (image_features @ text_features.T).squeeze(0)

            top_match_idx = similarity.argmax().item()
            top_match_score = similarity[top_match_idx].item()

            # Annotate the (BGR) frame in place with the winning label and score.
            cv2.putText(frame, f"Top match: {labels[top_match_idx]} ({top_match_score:.2f})",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            if show_frames:
                fig, ax = plt.subplots(1, figsize=(12, 8))
                # OpenCV frames are BGR; matplotlib expects RGB.
                ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                ax.axis('off')
                plt.tight_layout()
                plt.show()
                plt.close(fig)  # free the figure; one is created per frame

            out.write(frame)

            elapsed = time.time() - start
            if elapsed > 0:
                fps_list.append(1 / elapsed)

            ret, frame = cap.read()
    finally:
        # Release the capture and the writer even when a frame fails mid-way.
        cap.release()
        if out is not None:
            out.release()

    print(f"Video saved as '{output_path}'")
    if fps_list:
        print(f"Average FPS: {sum(fps_list)/len(fps_list):.2f}")

In [None]:
# Input clip to analyse (Colab-style absolute path).
video_path = '/content/input_vid.avi'

# Queries for OWL-ViT, wrapped in an outer list: one inner list of prompts.
text_labels = [
    [
        "matchbox and matchsticks",
        "pc monitor",
        "lion",
        "drone",
        "light bulb",
    ]
]

process_video_with_owlvit(video_path, text_labels)


In [None]:
# Candidate labels for CLIP as a flat list of strings. This rebinds
# `text_labels`, which the OWL-ViT cell defined as a nested list.
text_labels = [
    "matchbox and matchsticks",
    "pc monitor",
    "lion",
    "drone",
    "light bulb",
]

# `video_path` is the one defined in the OWL-ViT cell.
process_video_with_clip(video_path, text_labels)