In [None]:
!pip install torch torchvision transformers Pillow accelerate datasets einops

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
from datasets import Dataset, Features, Value, Image as DatasetImage
import os
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Hugging Face Hub checkpoint used for captioning, pinned to a fixed revision.
# These are only identifiers; the actual loading happens in the next cell.
revision = "2024-08-26"
model_id = "vikhyatk/moondream2"

In [None]:
# Load the pinned checkpoint.
# - trust_remote_code=True: allows the model code shipped with the checkpoint to run.
# - device_map=device: `device` is a plain device string ("cuda" or "cpu"),
#   not an offload folder; the model is placed on that device.
# - offload_state_dict=True: NOTE(review) presumably spills the CPU state dict
#   to disk while loading to reduce peak RAM — confirm against the transformers docs.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    device_map=device,
    offload_state_dict=True,
)

# Tokenizer from the same repository and revision as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

In [None]:
from datasets import load_dataset
# Load only the first ten rows of the train split so the run stays small.
images_ds = load_dataset(
    "Subh775/Extracted_Movies_dataset",
    split="train[:10]",
)
images_ds

In [None]:
import requests
from tqdm import tqdm
from datasets import load_dataset
from PIL import Image
from io import BytesIO

def generate_caption(image, model, tokenizer, device, question=None):
    """Ask the vision-language model to describe a single image.

    Parameters
    ----------
    image
        Image object accepted by ``model.encode_image`` (this notebook passes
        RGB PIL images).
    model
        Object exposing ``encode_image(image)`` and
        ``answer_question(encoded_image, question, tokenizer)``.
    tokenizer
        Forwarded unchanged to ``model.answer_question``.
    device
        Target passed to ``.to()`` on the encoded image.
    question
        Prompt sent to the model. When ``None`` (the default) the notebook's
        original prompt is used, so existing four-argument calls behave as before.

    Returns
    -------
    Whatever ``model.answer_question`` returns for the prompt (the caption).
    """
    if question is None:
        # Original hard-coded prompt, kept verbatim as the default.
        question = "Include the type of sale, colors, main design elements, layout, and any text that is visible in the image ,but exclude any promotional text or filler text "

    # Encode once, then move the encoding to the device the model runs on.
    enc_image = model.encode_image(image).to(device)

    return model.answer_question(enc_image, question, tokenizer)


def fetch_image(image_url, timeout=10):
    """Download an image and return it as an RGB PIL image, or ``None`` on failure.

    Parameters
    ----------
    image_url
        URL of the image to download.
    timeout
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without a timeout a single stalled server would
        block the whole captioning run indefinitely.

    Returns
    -------
    The decoded image converted to RGB, or ``None`` if the request failed or
    the payload could not be decoded. Failures are printed, not raised.
    """
    try:
        # The full body is read via `.content` right away, so a plain
        # (non-streaming) request is used; timeouts surface as RequestException.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        # Decode from memory and normalise to RGB.
        return Image.open(BytesIO(response.content)).convert("RGB")

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
    except (Image.UnidentifiedImageError, IOError) as e:
        print(f"Image processing error: {e}")
        return None

def process_images(images_ds, model, tokenizer, device):
    """Fetch and caption every poster in ``images_ds``.

    Each entry's ``'Poster'`` value is downloaded with ``fetch_image`` and
    captioned with ``generate_caption``. Entries whose download returns a
    falsy result, or whose captioning raises, are skipped (captioning errors
    are printed), so the output may have fewer rows than the input.

    Returns a dict with two parallel lists: ``"image"`` and ``"caption"``.
    """
    images = []
    captions = []

    with tqdm(total=len(images_ds), desc="Processing All Images") as progress:
        for entry in images_ds:
            image_url = entry['Poster']
            poster = fetch_image(image_url)

            if poster:
                try:
                    text = generate_caption(poster, model, tokenizer, device)
                except Exception as e:
                    print(f"Error generating caption for image from {image_url}: {e}")
                else:
                    # Record the pair only once captioning has succeeded.
                    images.append(poster)
                    captions.append(text)

            # Advance the bar for every entry, including skipped ones.
            progress.update(1)

    return {"image": images, "caption": captions}

# images_ds = load_dataset("Subh775/Extracted_Movies_dataset", split="train").select(range(1500))
# dataset_dict = process_images(images_ds, model, tokenizer, device)


In [None]:
# image_directory = r'/kaggle/working/'  # Replace with your base directory containing subfolders

# Download and caption every poster referenced in images_ds.
# The result is a dict of parallel "image" and "caption" lists.
dataset_dict = process_images(images_ds, model, tokenizer, device)

In [None]:
# Schema for the captioned dataset: a Hugging Face image column plus the
# caption text.
features = Features({
    "image": DatasetImage(),
    "caption": Value("string"),
})

# Build the dataset from the parallel lists produced by process_images.
dataset = Dataset.from_dict(dataset_dict, features=features)

# Read the Hub token from the environment instead of hard-coding it in the
# notebook. If HF_TOKEN is unset, None is passed and the library's own
# credential lookup applies.
hf_token = os.environ.get("HF_TOKEN")
dataset.push_to_hub("Subh775/movies_caption", token=hf_token)

# Shell out to kill processes matching "jupyter".
# NOTE(review): presumably meant to stop the hosted session once the upload
# has finished; the cells below will not run in the same session after this.
!pkill jupyter

In [None]:
# Rebuilds the same dataset as the upload cell, without pushing to the Hub,
# so it can be inspected locally.
features = Features({
    "image": DatasetImage(),  # Image column in the Hugging Face datasets Image format
    "caption": Value("string")  # Caption stored as text
})

# Create the dataset from the parallel "image"/"caption" lists in dataset_dict
dataset = Dataset.from_dict(dataset_dict, features=features)

In [None]:
# View the image and generated caption stored at row index 8.
# NOTE(review): process_images skips rows whose download or captioning failed,
# and only ten rows are loaded, so index 8 is out of range if fewer than nine
# rows were kept.
dataset[8]