In [2]:
import os

from huggingface_hub import login

# Never write an access token into the notebook itself: it would be saved
# (and shared) together with the file. The token is read from the HF_TOKEN
# environment variable instead; when that variable is unset or empty,
# `token=None` is passed so that `login` asks for the token interactively.
login(
    token=os.environ.get("HF_TOKEN") or None,
    add_to_git_credential=True,
)

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [1]:
import gradio as gr
import os
import cv2
import csv
import time
from PIL import Image
import torch
from ultralytics import YOLO
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func

# Filesystem locations and detection settings
image_dir = '/teamspace/studios/this_studio/Imagesflipkart'
output_cropped_dir = '/teamspace/studios/this_studio/cropped_images'
output_csv_path = 'detections_output.csv'
adapter_path = "/teamspace/studios/this_studio/newdescripterckp/checkpoint-241"
model_path = "/teamspace/studios/this_studio/best.pt"
threshold = 0.4  # detections are kept only when their score is above this value

# The Qwen weights and the processor are fetched from the same hub repository
# and use the same cache directory, so both values are bound once.
qwen_repo_id = "Qwen/Qwen2-VL-7B-Instruct"
qwen_cache_dir = "/teamspace/studios/this_studio/newdescripterckp"

# YOLO detector, loaded from the local weights file
yolo_model = YOLO(model_path)

# Qwen2-VL in bfloat16 with FlashAttention-2; device placement is left to "auto"
qwen_load_options = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    cache_dir=qwen_cache_dir,
)
model = Qwen2VLForConditionalGeneration.from_pretrained(qwen_repo_id, **qwen_load_options)

# max_pixels is forwarded to the processor as 720 * 28 * 28
processor = AutoProcessor.from_pretrained(
    qwen_repo_id,
    cache_dir=qwen_cache_dir,
    max_pixels=720 * 28 * 28,
)

# Load the adapter checkpoint on top of the base model and activate it
model.load_adapter(adapter_path)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:


def process_image(image):
    """Detect products with YOLO and describe each confident detection with Qwen.

    Args:
        image: RGB image as a NumPy array (the Gradio input component is
            declared with ``type="numpy"``), or ``None`` when no image was
            supplied.

    Returns:
        str: One ``**Qwen Output:** ...`` line per detection whose score is
        above the module-level ``threshold``. The string is empty when no
        detection qualifies, and a short notice when ``image`` is ``None``.
    """
    # The callback can be triggered before anything was uploaded, in which
    # case there is no array to convert and cv2.cvtColor would raise.
    if image is None:
        return "No image provided. Please upload an image first."

    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    results = yolo_model(image_bgr)[0]

    output_lines = []
    for x1, y1, x2, y2, score, _class_id in results.boxes.data.tolist():
        if not score > threshold:
            continue

        # Clamp at zero so a negative coordinate can never be interpreted as
        # a from-the-end index by NumPy slicing.
        left, top = max(int(x1), 0), max(int(y1), 0)
        right, bottom = max(int(x2), 0), max(int(y2), 0)
        cropped_image = image_bgr[top:bottom, left:right]
        if cropped_image.size == 0:
            # Zero-area box: there are no pixels to hand to the language model.
            continue

        cropped_pil_image = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
        qwen_output = process_cropped_image_with_qwen(cropped_pil_image)
        output_lines.append(f"**Qwen Output:** {qwen_output}\n")

    return "".join(output_lines)

def process_cropped_image_with_qwen(
    cropped_pil_image,
    prompt="Identify the brand name, product type, expiry date, manufacturing date, quantity only.",
    max_new_tokens=70,
):
    """Ask the Qwen model a question about one cropped product image.

    Args:
        cropped_pil_image: Image placed in the chat message as its
            ``"image"`` entry.
        prompt: Instruction text sent alongside the image. Defaults to the
            product-details prompt this notebook has always used.
        max_new_tokens: Upper bound on generated tokens passed to
            ``model.generate``.

    Returns:
        str: The decoded completion for the single request, with the prompt
        tokens removed and special tokens skipped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": cropped_pil_image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of assuming "cuda": the model is
    # loaded with device_map="auto", so its placement is not fixed by this code.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each generated sequence starts with its prompt tokens; keep only what
    # comes after them.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

# Gradio UI
def run_gradio_interface(image):
    """Gradio click handler: forward the uploaded image to ``process_image``.

    Returns whatever ``process_image`` returns for ``image``.
    """
    return process_image(image)

# Define Gradio interface with image input and text output
with gr.Blocks() as demo:

    # Page title and a description of what the tool returns. The description
    # lists the fields requested in the Qwen prompt and makes no promise about
    # inference times, because the handler does not include them in its output.
    gr.Markdown("## Product Inference with YOLO and Qwen")
    gr.Markdown(
        """
        **Upload an image of a product**, and this tool will run object detection with the YOLO model
        to identify the region of interest. Then, it will use the Qwen model to describe key product details,
        including the brand name, product type, expiry date, manufacturing date, and quantity.
        """
    )

    # Layout: upload + button on the left, text result on the right
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            submit_button = gr.Button("Run Inference")

        with gr.Column():
            output_box = gr.Textbox(label="Processed Output", lines=10, max_lines=20)

    # Clicking the button runs the handler on the uploaded image
    submit_button.click(fn=run_gradio_interface, inputs=image_input, outputs=output_box)

# Launch the app
# NOTE(review): share=True also publishes the app under a public gradio.live
# URL (see the "Running on public URL" line in the cell output) — confirm that
# exposing the model this way is intended.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://1f21cd1f6edd07b0d9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





0: 640x384 1 product, 73.0ms
Speed: 2.6ms preprocess, 73.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 product, 71.6ms
Speed: 2.4ms preprocess, 71.6ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)


In [2]:
# Function to draw bounding boxes on the image
def draw_bounding_boxes(image, results):
    """Draw a green box and an ``Object <class id>`` label for each confident detection.

    Args:
        image: Image array to draw on. The OpenCV drawing calls modify it in
            place, so pass a copy if the original must stay untouched.
        results: Detection result exposing ``boxes.data.tolist()``, whose rows
            unpack as ``x1, y1, x2, y2, score, class_id``.

    Returns:
        The same ``image`` object, with boxes and labels drawn for every
        detection whose score is above the module-level ``threshold``.
    """
    box_color = (0, 255, 0)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    text_thickness = 2

    for x1, y1, x2, y2, score, class_id in results.boxes.data.tolist():
        if not score > threshold:
            continue

        left, top, right, bottom = int(x1), int(y1), int(x2), int(y2)
        cv2.rectangle(image, (left, top), (right, bottom), box_color, 4)

        label = f"Object {int(class_id)}"
        # putText anchors the text at its bottom-left corner. Placing it 10 px
        # above the box pushes it off the image when the box touches the top
        # edge, so in that case the label is drawn just inside the box instead.
        (_, text_height), _ = cv2.getTextSize(label, font, font_scale, text_thickness)
        label_y = top - 10
        if label_y < text_height:
            label_y = top + text_height + 10
        cv2.putText(image, label, (left, label_y), font, font_scale, box_color, text_thickness)

    return image

# Function to process the image with YOLO and Qwen
def process_image(image):
    """Detect products with YOLO, annotate the image, and describe each detection with Qwen.

    Args:
        image: RGB image as a NumPy array (the Gradio input component is
            declared with ``type="numpy"``), or ``None`` when no image was
            supplied.

    Returns:
        tuple: ``(annotated_rgb_image, output_text)``. The image is a copy of
        the input with bounding boxes drawn on it; the text holds one
        ``**Qwen Output:** ...`` line per detection whose score is above the
        module-level ``threshold``. When ``image`` is ``None`` the result is
        ``(None, notice)``.
    """
    # The callback can be triggered before anything was uploaded, in which
    # case there is no array to convert and cv2.cvtColor would raise.
    if image is None:
        return None, "No image provided. Please upload an image first."

    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    results = yolo_model(image_bgr)[0]

    # Draw on a copy so the crops taken below contain no box or label pixels.
    image_with_boxes = draw_bounding_boxes(image_bgr.copy(), results)
    image_with_boxes_rgb = cv2.cvtColor(image_with_boxes, cv2.COLOR_BGR2RGB)

    output_lines = []
    for x1, y1, x2, y2, score, _class_id in results.boxes.data.tolist():
        if not score > threshold:
            continue

        # Clamp at zero so a negative coordinate can never be interpreted as
        # a from-the-end index by NumPy slicing.
        left, top = max(int(x1), 0), max(int(y1), 0)
        right, bottom = max(int(x2), 0), max(int(y2), 0)
        cropped_image = image_bgr[top:bottom, left:right]
        if cropped_image.size == 0:
            # Zero-area box: there are no pixels to hand to the language model.
            continue

        cropped_pil_image = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
        qwen_output = process_cropped_image_with_qwen(cropped_pil_image)
        output_lines.append(f"**Qwen Output:** {qwen_output}\n")

    return image_with_boxes_rgb, "".join(output_lines)

def process_cropped_image_with_qwen(
    cropped_pil_image,
    prompt="Identify the brand name, product type, expiry date, manufacturing date, quantity only.",
    max_new_tokens=70,
):
    """Ask the Qwen model a question about one cropped product image.

    Args:
        cropped_pil_image: Image placed in the chat message as its
            ``"image"`` entry.
        prompt: Instruction text sent alongside the image. Defaults to the
            product-details prompt this notebook has always used.
        max_new_tokens: Upper bound on generated tokens passed to
            ``model.generate``.

    Returns:
        str: The decoded completion for the single request, with the prompt
        tokens removed and special tokens skipped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": cropped_pil_image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of assuming "cuda": the model is
    # loaded with device_map="auto", so its placement is not fixed by this code.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each generated sequence starts with its prompt tokens; keep only what
    # comes after them.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

# Gradio UI
def run_gradio_interface(image):
    """Gradio click handler: run ``process_image`` and return its two results.

    Returns:
        tuple: the annotated image and the text produced by ``process_image``,
        in that order.
    """
    annotated, description = process_image(image)
    return (annotated, description)

# Define Gradio interface with image input, image output, and text output
with gr.Blocks() as demo:

    # Page title and a description of what the tool returns. The description
    # lists the fields requested in the Qwen prompt and makes no promise about
    # inference times, because the handler does not include them in its output.
    gr.Markdown("## Product Inference with YOLO and Qwen")
    gr.Markdown(
        """
        **Upload an image of a product**, and this tool will run object detection with the YOLO model
        to identify the region of interest. It will display the bounding boxes on the image.
        Then, it will use the Qwen model to describe key product details, including the brand name,
        product type, expiry date, manufacturing date, and quantity.
        """
    )

    # Layout: upload + button on the left, annotated image + text on the right
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")  # Image upload box
            submit_button = gr.Button("Run Inference")  # Submit button

        with gr.Column():
            image_output = gr.Image(label="Image with Bounding Boxes")  # Image output with bounding boxes
            output_box = gr.Textbox(label="Processed Output", lines=10, max_lines=20)  # Text output

    # Clicking the button runs the handler; its two return values fill the
    # image and text outputs in that order.
    submit_button.click(fn=run_gradio_interface, inputs=image_input, outputs=[image_output, output_box])

# Launch the app
# NOTE(review): share=True also publishes the app under a public gradio.live
# URL (see the "Running on public URL" line in the cell output) — confirm that
# exposing the model this way is intended.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://d8c2ab7f9944665c1d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





0: 640x384 1 product, 73.8ms
Speed: 8.3ms preprocess, 73.8ms inference, 464.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x480 1 product, 73.8ms
Speed: 4.4ms preprocess, 73.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x384 1 product, 70.6ms
Speed: 1.6ms preprocess, 70.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)
