In [7]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image, ImageDraw
import requests
import cv2
import numpy as np
import time
from IPython.display import display
from moviepy import *
import os
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
from tqdm import tqdm
import json
import copy

In [10]:
VIDEO_PATH = "lainconn/Hackathon-Peleng/train/videoset1"
MODEL_PATH = "google/owlv2-base-patch16-ensemble"
BATCH_SIZE = 1
VIDEO_LENGTH = 5 # in seconds
VIDEO_STEP = 1 # in seconds

In [11]:
timestamps = np.arange(0, VIDEO_LENGTH, VIDEO_STEP)
video_paths = [os.path.join(VIDEO_PATH, video) for video in os.listdir(VIDEO_PATH)]

def extract_frame_at_timestamp(video_path, timestamp):
    cap = cv2.VideoCapture(video_path)
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_number = int(timestamp * fps)
    
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
    
    ret, frame = cap.read()
    if not ret:
        cap.release()
        return None
    
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    cap.release()
    
    frame_image = Image.fromarray(frame_rgb)
    return frame_image

def process_video(video_path):
    frames = []
    for timestamp in timestamps:
        frame_image = extract_frame_at_timestamp(video_path, timestamp)
        if frame_image:
            frames.append(frame_image)
    return frames

with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_video, video_paths))

img_list = list(chain.from_iterable(results))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'lainconn/Hackathon-Peleng/train/videoset1'

In [6]:
image_processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH).to("cuda")

preprocessor_config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

In [10]:
class CustomDataset(Dataset):
    def __init__(self, data_dict, text_queries, image_processor):
        self.pixel_values = []
        self.input_ids = []
        self.attention_masks = []

        for img, query in zip(data_dict[85:150], text_queries[85:150]):
            inputs = image_processor(text=query, images=img, return_tensors="pt")
            self.pixel_values.append(inputs["pixel_values"].squeeze(0))
            self.input_ids.append(inputs["input_ids"].squeeze(0))
            self.attention_masks.append(inputs["attention_mask"].squeeze(0))

    def __len__(self):
        return len(self.pixel_values)

    def __getitem__(self, idx):
        return {
            "pixel_values": self.pixel_values[idx],
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx]
        }

text_queries = [["a small dark spherical object"] for i in range(len(img_list))]
dataset = CustomDataset(img_list,text_queries,image_processor)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, pin_memory=True)

In [16]:
"""text_queries = [["a small green spherical object"] for i in range(len(img_list))]
inputs = image_processor(text=text_queries, images=img_list, return_tensors="pt")"""

In [17]:
"""class CustomDataset(Dataset):
    def __init__(self, data_dict):
        self.pixel_values = data_dict["pixel_values"]
        self.input_ids = data_dict["input_ids"]
        self.attention_mask = data_dict["attention_mask"]

    def __len__(self):
        return len(self.pixel_values)

    def __getitem__(self, idx):
        return {
            "pixel_values": self.pixel_values[idx],
            "input_ids": self.input_ids[idx],
            "attention_mask" : self.attention_mask[idx]
        }

dataset = CustomDataset(inputs)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE)"""

In [11]:
total_output = []
with torch.no_grad():
    for batch in tqdm(data_loader):
        for k,v in batch.items():
            batch[k] = v.to("cuda")
        outputs = model(**batch)
        total_output.append(image_processor.post_process_object_detection(
            outputs, 
            threshold=0.1, 
            target_sizes=[(1080,1920)])
                           )

100%|██████████| 65/65 [01:27<00:00,  1.35s/it]


In [22]:
testing = copy.deepcopy(total_output)
for scene in testing:
    for key, value in scene[0].items():
        if isinstance(value, torch.Tensor):
            scene[0][key] = value.tolist()

In [23]:
box_inx = []

for sublist_index, sublist in enumerate(testing):
    for d in sublist:
        if "boxes" in d:
            box_inx.append(d["boxes"])

end_inx = len(img_list) // 3
output_json = {"video3":box_inx[:end_inx],
              "video2":box_inx[end_inx:end_inx*2],
              "video1":box_inx[end_inx*2:end_inx*3]}

In [24]:
output_path = "/kaggle/working/data.json"

with open(output_path, "w") as json_file:
    json.dump(output_json, json_file, indent=4)

In [12]:
output_video_path = "/kaggle/working/output.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
fps = 1
frame_size = (img_list[0].width, img_list[0].height)
out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)

for frame, result in zip(img_list, total_output):
    draw = ImageDraw.Draw(frame)
    boxes = result[0]["boxes"]
    scores = result[0]["scores"]
    labels = result[0]["labels"]

    for box, score, label in zip(boxes, scores, labels):
        x_min, y_min, x_max, y_max = box
        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline="red", width=3)

    frame_cv2 = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
    out.write(frame_cv2)

out.release()