In [None]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

# Function to load and preprocess an image
def load_and_preprocess_image(image_url):
    image = Image.open(requests.get(image_url, stream=True).raw)
    # Resize the image to the required dimensions for CLIP (224x224)
    image = image.resize((224, 224))
    return image

# Function to check similarity between an image and a text
def check_similarity(image, text, clip_model, clip_processor):
    # Encode the image
    inputs = clip_processor(images=image, return_tensors="pt")
    image_features = clip_model.get_image_features(**inputs)

    # Encode the text
    inputs = clip_processor(text=text, return_tensors="pt")
    text_features = clip_model.get_text_features(**inputs)

    # Calculate cosine similarity between image and text features
    similarity = torch.nn.functional.cosine_similarity(image_features, text_features)
    return similarity.item()

# Sample image URL and text
image_url = "http://images.cocodataset.org/val2014/COCO_val2014_000000159977.jpg"
text_to_compare = "a person riding a horse"

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load and preprocess image
image = load_and_preprocess_image(image_url)

# Check similarity between image and text
similarity_score = check_similarity(image, text_to_compare, clip_model, clip_processor)
print("Similarity score:", similarity_score)


In [None]:
image

In [None]:
check_similarity(image, "a giraffe and zebra", clip_model, clip_processor)

In [16]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer

# Function to load and preprocess an image
def load_and_preprocess_image(image_url):
    try:
        response = requests.get(image_url, stream=True)
        response.raise_for_status()  # Raise error for invalid URL or unsuccessful download
        image = Image.open(response.raw)
        # Resize the image to the required dimensions for CLIP (224x224)
        image = image.resize((224, 224))
        return image
    except Exception as e:
        print(f"Error loading image: {e}")
        return None

# Function to generate text description for an image
def generate_image_description(image, clip_model, clip_processor, gpt_model, gpt_tokenizer):
    try:
        # Encode the image
        inputs = clip_processor(images=image, return_tensors="pt")
        image_features = clip_model.get_image_features(**inputs)[0]  # Get the image features

        # Generate text description using a separate text generation model (e.g., GPT)
        generated_text = gpt_model.generate(
            features=image_features.unsqueeze(0),  # Ensure the input shape is compatible with GPT-2
            max_length=100,  # Adjust max_length as needed
            pad_token_id=gpt_tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,  # Adjust temperature for diversity in generated text
            top_k=50,  # Adjust top_k for diversity in generated text
            top_p=0.95,  # Adjust top_p for diversity in generated text
            num_return_sequences=1
        )

        generated_text = gpt_tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        print(f"Error generating description: {e}")
        return None

# Sample image URL
image_url = "http://images.cocodataset.org/val2014/COCO_val2014_000000159977.jpg"

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load GPT model and tokenizer for text generation
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load and preprocess image
image = load_and_preprocess_image(image_url)

if image:
    # Generate text description for the image
    description = generate_image_description(image, clip_model, clip_processor, gpt_model, gpt_tokenizer)
    if description:
        print("Generated Description:", description)
    else:
        print("Failed to generate description.")
else:
    print("Image loading failed. Check the URL and try again.")


Error generating description: The following `model_kwargs` are not used by the model: ['features'] (note: typos in the generate arguments will also show up in this list)
Failed to generate description.


In [84]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer

# Function to load and preprocess an image
def load_and_preprocess_image(image_url):
    try:
        response = requests.get(image_url, stream=True)
        response.raise_for_status()  # Raise error for invalid URL or unsuccessful download
        image = Image.open(response.raw)
        # Resize the image to the required dimensions for CLIP (224x224)
        image = image.resize((224, 224))
        return image
    except Exception as e:
        print(f"Error loading image: {e}")
        return None

# Function to generate text description for an image
def generate_image_description(image, clip_model, clip_processor, gpt_model, gpt_tokenizer):
    try:
        # Encode the image
        inputs = clip_processor(images=image, return_tensors="pt")
        image_features = clip_model.get_image_features(**inputs)[0]  # Get the image features

        # Encode the prompt text
        prompt_text = "A photo of"
        prompt_input = gpt_tokenizer.encode(prompt_text, return_tensors="pt")[0]

        # Convert prompt input to Long type
        prompt_input = prompt_input.long()

        # Concatenate prompt input with image features
        combined_input = torch.cat([prompt_input, image_features.view(-1)])

        print("Image features shape:", image_features.shape)
        print("Prompt input:", prompt_input)
        print("Combined input shape:", combined_input.shape)

        # Generate text description using GPT-2 model
        generated_text = gpt_model.generate(
            input_ids=combined_input.unsqueeze(0),
            max_length=600,  # Adjust max_length
            pad_token_id=gpt_tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,  # Adjust temperature for diversity in generated text
            top_k=50,  # Adjust top_k for diversity in generated text
            top_p=0.95,  # Adjust top_p for diversity in generated text
            num_return_sequences=1,
            max_new_tokens=200  # Adjust max_new_tokens to limit the number of generated tokens
        )

        generated_text = gpt_tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        print(f"Error generating description: {e}")
        return None

# Sample image URL
image_url = "http://images.cocodataset.org/val2014/COCO_val2014_000000159977.jpg"

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load GPT-2 model and tokenizer for text generation
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load and preprocess image
image = load_and_preprocess_image(image_url)

if image:
    # Generate text description for the image
    description = generate_image_description(image, clip_model, clip_processor, gpt_model, gpt_tokenizer)
    if description:
        print("Generated Description:", description)
    else:
        print("Failed to generate description.")
else:
    print("Image loading failed. Check the URL and try again.")


Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Image features shape: torch.Size([512])
Prompt input: tensor([  32, 4590,  286])
Combined input shape: torch.Size([515])
Error generating description: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
Failed to generate description.


In [None]:
http://images.cocodataset.org/val2014/COCO_val2014_000000159977.jpg