In [1]:
import os
import cv2
import time
import gradio as gr
from PIL import Image
import torch
from ultralytics import YOLO
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# --- Paths and detection settings ---------------------------------------
output_cropped_dir = 'saved_frames'  # Output directory for cropped frames
adapter_path = "/teamspace/studios/this_studio/newdescripterckp/checkpoint-241"
model_path = "/teamspace/studios/this_studio/video_v2.pt"
threshold = 0.4  # a detection is only described when its score is strictly above this

# Literals shared by the Qwen model and its processor
qwen_model_id = "Qwen/Qwen2-VL-7B-Instruct"
qwen_cache_dir = "/teamspace/studios/this_studio/newdescripterckp"

# --- YOLO detector -------------------------------------------------------
yolo_model = YOLO(model_path)

# --- Qwen2-VL model, processor and fine-tuned adapter --------------------
model = Qwen2VLForConditionalGeneration.from_pretrained(
    qwen_model_id,
    cache_dir=qwen_cache_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(
    qwen_model_id,
    cache_dir=qwen_cache_dir,
    max_pixels=1080 * 28 * 28,
)
model.load_adapter(adapter_path)  # Load adapter and activate

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Ensure the output directory exists. exist_ok=True makes the cell safe to
# re-run and removes the gap between checking for the directory and creating it.
os.makedirs(output_cropped_dir, exist_ok=True)

# Function to reduce bounding box and mask
def reduce_bounding_box(x1, y1, x2, y2, reduction_factor=0.05):
    """Shrink a box inward on all four sides.

    Each side is moved toward the centre by ``int(reduction_factor * size)``,
    where ``size`` is the box width for the x edges and the box height for
    the y edges.

    Args:
        x1, y1: Top-left corner of the box.
        x2, y2: Bottom-right corner of the box.
        reduction_factor: Fraction of the width/height removed from each side.

    Returns:
        tuple: ``(x1_new, y1_new, x2_new, y2_new)``.
    """
    inset_x = int(reduction_factor * (x2 - x1))
    inset_y = int(reduction_factor * (y2 - y1))
    return x1 + inset_x, y1 + inset_y, x2 - inset_x, y2 - inset_y

def mask_reduced_bounding_box(frame, x1, y1, x2, y2):
    """Zero out the rectangle ``[y1:y2, x1:x2]`` of ``frame`` in place.

    Coordinates are truncated with ``int()`` before slicing. The same
    ``frame`` object is returned after being modified.
    """
    rows = slice(int(y1), int(y2))
    cols = slice(int(x1), int(x2))
    frame[rows, cols] = 0  # black out the region
    return frame

# Function to process each object and run Qwen inference
def process_cropped_object(cropped_pil_image, frame_count, object_count):
    """Run Qwen inference on one cropped object image and format the reply.

    Args:
        cropped_pil_image: Image placed in the "image" entry of the chat message.
        frame_count: Frame label interpolated into the output header.
        object_count: Object index interpolated into the output header.

    Returns:
        str: A header line naming the frame and object, followed by the
        decoded model reply framed by two dashed rules.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": cropped_pil_image},
                {"type": "text", "text": "Identify the brand name, product type, expiry date, manufacturing date, quantity only."}
            ]
        }
    ]

    # Prepare input for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # The model is loaded with device_map="auto", so follow the model's own
    # device instead of assuming a hard-coded "cuda".
    inputs = inputs.to(model.device)

    # Generate output from the Qwen model
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated tokens are decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Format the output nicely for each object
    formatted_output = (
        f"Inference Output for Frame {frame_count}, Object {object_count}:\n"
        f"----------------------------------------------------\n"
        f"{output_text[0]}\n"
        f"----------------------------------------------------\n"
    )

    return formatted_output

# Function to run YOLO on a frame and count the number of detected objects
def count_objects_in_frame(frame):
    """Run the YOLO model on ``frame`` and count its detections.

    Returns:
        tuple: ``(object_count, results)`` where ``results`` is the first
        element of the YOLO call's output and ``object_count`` is the number
        of rows in ``results.boxes.data``.
    """
    detections = yolo_model(frame)[0]
    # NOTE(review): every detection is counted here, while
    # process_frame_with_max_objects only keeps those with score > threshold.
    boxes = detections.boxes.data.tolist()
    return len(boxes), detections

# Function to process video and filter frames with the most detected objects
def process_video_and_filter_frames(video_path):
    """Sample a video, pick the frame with the most detections, and describe it.

    One frame out of every ``int(fps)`` frames is passed to
    ``count_objects_in_frame``. The sampled frame with the strictly highest
    detection count is kept (the earliest one wins ties) and handed to
    ``process_frame_with_max_objects``.

    Args:
        video_path: Path of the video to open with ``cv2.VideoCapture``.

    Returns:
        str: ``"Error: Could not open video."`` when the path is empty or the
        capture cannot be opened; otherwise the text produced for the selected
        frame, or an empty string when no sampled frame had any detection.
    """
    if not video_path:
        # Nothing to open (e.g. the form was submitted without a video).
        return "Error: Could not open video."

    cap = cv2.VideoCapture(video_path)
    selected_frame = None
    selected_results = None
    try:
        if not cap.isOpened():
            return "Error: Could not open video."

        video_fps = cap.get(cv2.CAP_PROP_FPS)  # Original video FPS
        # The reported FPS can be 0, below 1, or not a finite number. Clamp the
        # sampling interval to at least 1 so the modulo below never divides by zero.
        try:
            frame_interval = max(1, int(video_fps))
        except (TypeError, ValueError, OverflowError):
            frame_interval = 1

        frame_count = 0
        max_objects = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break  # Exit the loop if there are no more frames

            if frame_count % frame_interval == 0:
                object_count, yolo_results = count_objects_in_frame(frame)
                print(f"Frame {frame_count} has {object_count} objects.")

                if object_count > max_objects:
                    max_objects = object_count
                    # Copy so the selected frame does not share memory with
                    # whatever later reads return.
                    selected_frame = frame.copy()
                    selected_results = yolo_results

            frame_count += 1
    finally:
        # Always release the capture, even if detection raised.
        # No GUI windows are opened here, so there is nothing else to tear down.
        cap.release()

    output_text = ""
    if selected_frame is not None and selected_results is not None:
        output_text = process_frame_with_max_objects(selected_frame, selected_results)

    return output_text

# Function to process the frame with the most detected objects, crop them, and run inference one by one
def process_frame_with_max_objects(frame, results):
    """Crop each confident detection from ``frame`` and describe it with Qwen.

    For every row of ``results.boxes.data`` whose score is strictly above the
    module-level ``threshold``, the box is shrunk with ``reduce_bounding_box``,
    cropped, converted from BGR to RGB, and passed to
    ``process_cropped_object``. The shrunk box is then blacked out in
    ``frame``.

    Args:
        frame: Image array to crop from. It is modified in place by the masking step.
        results: Detection result exposing ``boxes.data`` with rows of
            ``(x1, y1, x2, y2, score, class_id)``.

    Returns:
        str: Concatenation of the formatted outputs of all processed objects,
        or an empty string when nothing qualified.
    """
    object_count = 0
    outputs = []

    for x1, y1, x2, y2, score, _class_id in results.boxes.data.tolist():
        if not score > threshold:
            continue

        # Reduce the bounding box before cropping
        x1_new, y1_new, x2_new, y2_new = reduce_bounding_box(x1, y1, x2, y2)

        cropped_image = frame[int(y1_new):int(y2_new), int(x1_new):int(x2_new)]
        if cropped_image.size == 0:
            # A degenerate box yields an empty crop, which cv2.cvtColor
            # cannot convert; skip it instead of aborting the whole frame.
            continue

        object_count += 1
        cropped_pil_image = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

        # Run Qwen inference on the cropped image, one by one
        outputs.append(process_cropped_object(cropped_pil_image, "max_frame", object_count))

        # Mask the reduced bounding box area in the frame so later crops
        # that overlap this region see it blacked out.
        frame = mask_reduced_bounding_box(frame, x1_new, y1_new, x2_new, y2_new)

    return "".join(outputs)

# Gradio function to process video and display results
def run_inference_on_video(video_file):
    """Gradio callback: return the inference text for the given video.

    Delegates directly to ``process_video_and_filter_frames``.
    """
    return process_video_and_filter_frames(video_file)

# Gradio UI
# Builds a page with a heading, a video input, a text output and a button.
# Components are created inside the Blocks context in the order shown below.
with gr.Blocks() as demo:
    gr.Markdown("## Video Inference with YOLO and Qwen")
    
    video_input = gr.Video(label="Upload Video")
    output_textbox = gr.Textbox(label="Inference Output")
    
    submit_button = gr.Button("Run Inference")
    # On click, the video input value is passed to run_inference_on_video and
    # the returned string is written to the output textbox.
    submit_button.click(run_inference_on_video, inputs=video_input, outputs=output_textbox)

# Launch Gradio UI
# NOTE(review): share=True also publishes the app on a public gradio.live URL
# (see the cell output); confirm that public exposure is intended.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://24ac92ace7650ee2ff.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





0: 384x640 1 product, 75.3ms
Speed: 5.8ms preprocess, 75.3ms inference, 325.3ms postprocess per image at shape (1, 3, 384, 640)
Frame 0 has 1 objects.

0: 384x640 1 product, 6.9ms
Speed: 1.5ms preprocess, 6.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Frame 30 has 1 objects.

0: 384x640 1 product, 6.6ms
Speed: 1.7ms preprocess, 6.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
Frame 60 has 1 objects.

0: 384x640 2 products, 6.7ms
Speed: 2.0ms preprocess, 6.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
Frame 90 has 2 objects.

0: 384x640 2 products, 6.7ms
Speed: 1.4ms preprocess, 6.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
Frame 120 has 2 objects.

0: 384x640 3 products, 6.6ms
Speed: 1.4ms preprocess, 6.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Frame 150 has 3 objects.

0: 384x640 4 products, 6.7ms
Speed: 1.7ms preprocess, 6.7ms inference, 1.1ms postprocess per imag