In [1]:
!pip install transformers
!pip install huggingface_hub
!pip install datasets
!pip install gradio
!pip install ultralytics

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# New Section

In [2]:
import pickle
from transformers import AutoProcessor, AutoModelForImageTextToText, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO
from PIL import Image
import re
import openai
import joblib

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
from huggingface_hub import login
from google.colab import userdata

# Fetch your Hugging Face API key from the secret
hug_api_key = userdata.get('HF_TOKEN')

# Log in with the API key
login(hug_api_key)

In [6]:
from datasets import load_dataset

# Load dataset (using Hugging Face dataset)
dataset = load_dataset("philschmid/amazon-product-descriptions-vlm")




README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/47.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1345 [00:00<?, ? examples/s]

In [7]:
# Initialize YOLO Model for Object Detection
def initialize_yolo_model():
    yolo_model = YOLO("yolov8x.pt")  # Load YOLOv8 model
    return yolo_model

yolo_model = initialize_yolo_model()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8x.pt to 'yolov8x.pt'...


100%|██████████| 131M/131M [00:00<00:00, 288MB/s]


In [8]:
# 1. Detect Main Object in the Image
def detect_main_object(image):
    results = yolo_model(image)
    # Extract labels of detected objects
    detected_objects = results[0].names if results[0].names else []
    # Choose the most prominent detected object
    if detected_objects:
        product_name = detected_objects[0].capitalize()
    else:
        product_name = "Unknown Object"
    return product_name

In [9]:
# Load the BLIP processor and model (image captioning)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [10]:
class ProductDescriptionDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, processor, max_len=128):
        self.dataset = dataset
        self.processor = processor
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        # Concatenate multiple columns into a single text string
        text = f"{item['Product Name']} | "
        text += f"{item['Category']} | "
        text += f"{item['Product Specification']} | "
        text += f" {item['About Product']} | "
        text += f"{item['description']}"

        # Load and preprocess the image
        image = item['image']
        image_inputs = self.processor(images=image, return_tensors="pt")

        # Tokenize the concatenated description
        text_inputs = self.processor.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        return {
            'image': image_inputs['pixel_values'].squeeze(0),
            'text': text_inputs['input_ids'].squeeze(0),
        }


In [11]:

from torch.utils.data import DataLoader
train_dataset = ProductDescriptionDataset(dataset['train'], processor)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [12]:
# Optimizer and loss
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-6)

In [13]:
from tqdm import tqdm

# Fine-tune the BLIP model with the correct descriptions
epochs =10
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        image_inputs = batch['image'].to(device)
        text_inputs = batch['text'].to(device)
        outputs = model(input_ids=text_inputs, pixel_values=image_inputs, labels=text_inputs)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

Epoch 1/10:   2%|▏         | 3/169 [00:04<04:04,  1.47s/it]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1/10: 100%|██████████| 169/169 [03:51<00:00,  1.37s/it]


Epoch 1/10 - Loss: 4.7240


Epoch 2/10: 100%|██████████| 169/169 [03:52<00:00,  1.38s/it]


Epoch 2/10 - Loss: 3.0477


Epoch 3/10: 100%|██████████| 169/169 [03:52<00:00,  1.38s/it]


Epoch 3/10 - Loss: 2.5102


Epoch 4/10: 100%|██████████| 169/169 [03:52<00:00,  1.38s/it]


Epoch 4/10 - Loss: 2.1367


Epoch 5/10: 100%|██████████| 169/169 [03:53<00:00,  1.38s/it]


Epoch 5/10 - Loss: 1.8854


Epoch 6/10: 100%|██████████| 169/169 [03:52<00:00,  1.38s/it]


Epoch 6/10 - Loss: 1.7262


Epoch 7/10: 100%|██████████| 169/169 [03:52<00:00,  1.38s/it]


Epoch 7/10 - Loss: 1.6011


Epoch 8/10: 100%|██████████| 169/169 [03:53<00:00,  1.38s/it]


Epoch 8/10 - Loss: 1.4849


Epoch 9/10: 100%|██████████| 169/169 [03:55<00:00,  1.39s/it]


Epoch 9/10 - Loss: 1.3746


Epoch 10/10: 100%|██████████| 169/169 [03:54<00:00,  1.39s/it]

Epoch 10/10 - Loss: 1.2787





In [14]:

# Save the  model and processor

model_path = '/content/drive/MyDrive/fine_tuned_model.pkl'  # Change the path as needed
joblib.dump(model, model_path)

# Save the processor as well
processor_path = '/content/drive/MyDrive/fine_tuned_processor.pkl'
joblib.dump(processor, processor_path)

print("Model and processor saved to Google Drive!")
# Save the fine-tuned model and processor in pickle format


Model and processor saved to Google Drive!


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://da7e40c2f9c4d9c92a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



0: 640x640 1 bottle, 79.5ms
Speed: 32.4ms preprocess, 79.5ms inference, 142.6ms postprocess per image at shape (1, 3, 640, 640)
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://da7e40c2f9c4d9c92a.gradio.live




In [None]:
# # Function to compute accuracy
# def compute_accuracy(generated_description, ground_truth):
#     if not ground_truth:
#         return "No Ground Truth Provided"
#     generated_words = set(generated_description.lower().split())
#     ground_truth_words = set(ground_truth.lower().split())
#     overlap = len(generated_words & ground_truth_words)
#     total = len(ground_truth_words)
#     return f"{round((overlap / total) * 100, 2)}%"

# # Define Gradio interface
# def generate_and_evaluate(image, ground_truth):
#     product_name, product_description = generate_product_description(image)
#     accuracy = compute_accuracy(product_description, ground_truth)
#     return product_name, product_description, accuracy

# def collect_feedback(product_name, product_description, feedback):
#     with open("feedback.txt", "a") as feedback_file:
#         feedback_file.write(
#             f"Product Name: {product_name}\nDescription: {product_description}\nFeedback: {feedback}\n{'-'*50}\n"
#         )
#     return "Thank you for your feedback!"

In [None]:

# # Create Gradio Blocks interface
# with gr.Blocks() as demo:
#     gr.HTML("<h1>VisonClarity</h1>")
#     gr.Markdown(
#         "Upload an image, and optionally provide ground truth for accuracy evaluation. "
#         "You can also provide feedback on the generated outputs."
#     )

#     with gr.Tab("Generate Description"):
#         product_image = gr.Image(label="Upload Product Image")
#         ground_truth = gr.Textbox(
#             label="Ground Truth (Optional)",
#             placeholder="Provide ground truth description for accuracy calculation",
#         )
#         product_name = gr.Textbox(label="Generated Product Name", interactive=False)
#         product_description = gr.Textbox(label="Generated Product Description", interactive=False)
#         accuracy_score = gr.Textbox(label="Accuracy", interactive=False)
#         generate_button = gr.Button("Generate and Evaluate")
#         generate_button.click(
#             generate_and_evaluate,
#             inputs=[product_image, ground_truth],
#             outputs=[product_name, product_description, accuracy_score],
#         )

#     with gr.Tab("Provide Feedback"):
#         user_feedback = gr.Textbox(label="Your Feedback", placeholder="What do you think about the generated results?")
#         submit_button = gr.Button("Submit Feedback")
#         submit_button.click(
#             collect_feedback,
#             inputs=[product_name, product_description, user_feedback],
#             outputs="text",
#         )

# demo.launch(share=True,debug=True)
