In [2]:
!pip install torch torchvision transformers Pillow accelerate datasets einops



In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import os
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face Hub repository of the model and the exact revision to load.
model_id = "vikhyatk/moondream2"
revision = "2024-08-26"

In [4]:
# Load the pinned Moondream checkpoint from the Hub.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    # The repository ships its own modeling code, which must be allowed to run.
    trust_remote_code=True,
    # Device string selected earlier ("cuda" or "cpu").
    device_map=device,
    # NOTE(review): loading-time state-dict offload option; confirm it is still
    # needed / supported by the installed transformers version.
    offload_state_dict=True,
)

# Tokenizer from the same repository and revision as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

configuration_moondream.py:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

moondream.py:   0%|          | 0.00/7.20k [00:00<?, ?B/s]

vision_encoder.py:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/63.1k [00:00<?, ?B/s]

region_model.py:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

fourier_features.py:   0%|          | 0.00/558 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
from datasets import load_dataset
# Load one 1000-row batch (rows 8000-8999 of the train split) of the movie dataset.
images_ds = load_dataset(
    "Subh775/Extracted_Movies_dataset",
    split="train[8000:9000]",
)
images_ds

Dataset({
    features: ['Title', 'description', 'Poster'],
    num_rows: 1000
})

In [10]:
import os
from io import BytesIO
from urllib.parse import urlsplit

import requests
from datasets import load_dataset
from PIL import Image
from tqdm import tqdm

def generate_caption(image, model, tokenizer, device):
    """Ask the vision-language model to describe ``image``.

    Args:
        image: Image object accepted by ``model.encode_image``.
        model: Object exposing ``encode_image`` and ``answer_question``.
        tokenizer: Tokenizer forwarded unchanged to ``model.answer_question``.
        device: Target passed to ``.to(...)`` on the encoded image.

    Returns:
        Whatever ``model.answer_question`` returns for the fixed prompt below.
    """
    # NOTE(review): the prompt mentions "type of sale" although this notebook
    # feeds it movie posters — confirm the wording is intentional.
    prompt = "Include the type of sale, colors, main design elements, layout, and any text that is visible in the image, but exclude any promotional text or filler text."

    encoded = model.encode_image(image)
    encoded_on_device = encoded.to(device)
    return model.answer_question(encoded_on_device, prompt, tokenizer)

def fetch_image(image_url, timeout=30):
    """Download ``image_url`` and return it as an RGB PIL image.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the whole batch indefinitely.

    Returns:
        The decoded image converted to RGB, or ``None`` if the request fails
        or the payload cannot be opened as an image (the error is printed).
    """
    try:
        # stream=True defers the body download until the status has been checked;
        # the context manager releases the connection even on an error status.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            payload = response.content
        return Image.open(BytesIO(payload)).convert("RGB")
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs and HTTP error statuses.
        print(f"Request error: {e}")
        return None
    except (Image.UnidentifiedImageError, IOError) as e:
        print(f"Image processing error: {e}")
        return None

def resize_image(image, size=(256, 256)):
    """Return ``image`` resized to ``size`` using the LANCZOS resampling filter.

    ``size`` is forwarded to ``image.resize`` unchanged.
    """
    resized = image.resize(size, resample=Image.LANCZOS)
    return resized

def _poster_filename(image_url, index):
    """Build a unique, saveable file name for the poster at position ``index``."""
    # Use only the URL path so a query string or fragment never ends up in the name.
    base_name = os.path.basename(urlsplit(image_url).path)
    stem, extension = os.path.splitext(base_name)
    if not extension:
        # The image is saved by file name, so it needs an extension; fetched
        # images are converted to RGB, which JPEG can store.
        extension = ".jpg"
    # The index prefix keeps posters that share a base name from overwriting each other.
    return f"{index:05d}_{stem or 'image'}{extension}"


def process_images(images_ds, model, tokenizer, device, output_dir='resized_images'):
    """Download, resize, caption and save every poster in ``images_ds``.

    Args:
        images_ds: Sized iterable of records with a ``'Poster'`` URL entry.
        model: Captioning model forwarded to ``generate_caption``.
        tokenizer: Tokenizer forwarded to ``generate_caption``.
        device: Device forwarded to ``generate_caption``.
        output_dir: Directory (created if missing) for the resized images.

    Returns:
        A dict with two parallel lists: ``"image"`` (paths of the saved resized
        images) and ``"caption"`` (the generated captions). Records whose image
        cannot be fetched, captioned or saved are reported and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)
    dataset_dict = {
        "image": [],
        "caption": []
    }

    progress = tqdm(images_ds, total=len(images_ds), desc="Processing All Images")
    for index, entry in enumerate(progress):
        image_url = entry['Poster']
        image = fetch_image(image_url)
        if image is None:
            # fetch_image has already printed why the download failed.
            continue

        # Best-effort by design: one bad record must not abort a long batch run.
        try:
            resized_image = resize_image(image)
            caption = generate_caption(resized_image, model, tokenizer, device)
        except Exception as e:
            print(f"Error generating caption for image from {image_url}: {e}")
            continue

        try:
            resized_image_path = os.path.join(output_dir, _poster_filename(image_url, index))
            resized_image.save(resized_image_path)
        except Exception as e:
            print(f"Error saving image from {image_url}: {e}")
            continue

        # Append only after a successful save so both lists stay aligned.
        dataset_dict["image"].append(resized_image_path)  # Store the path
        dataset_dict["caption"].append(caption)

    return dataset_dict

In [11]:
# Fetch, resize, caption and save every poster in the loaded batch.
dataset_dict = process_images(
    images_ds,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

Processing All Images: 100%|██████████| 1000/1000 [1:03:30<00:00,  3.81s/it]


In [None]:
import time
from datasets import Dataset, Features, Value
# Import the datasets image feature under an alias: importing it as `Image`
# would rebind the global `Image` that fetch_image/resize_image use as PIL.Image
# and break them if process_images is run again after this cell.
from datasets import Image as HFImage

# Define dataset features
features = Features({
    "image": HFImage(),  # Image column (built from the saved file paths)
    "caption": Value("string")  # Captions as text
})

# Start timing
start_time = time.time()

# Create dataset from dictionary
dataset = Dataset.from_dict(dataset_dict, features=features)


# Push the dataset to the Hugging Face Hub
#dataset.push_to_hub("Subh775/First1000", token="write_your-tokens")

# Save the dataset in CSV format
dataset.to_csv("Batch9.csv", index=False)

# Measure elapsed time
elapsed_time = time.time() - start_time
print(f"Time elapsed: {elapsed_time:.2f} seconds")


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Time elapsed: 0.05 seconds


In [None]:
# View the image/caption record stored at a given index, for example:
#dataset[300]