In [None]:
# Using OpenAI's CLIP model
pip install git+https://github.com/openai/CLIP.git

In [2]:
# Directory containing the test images to classify
test_dir = "/kaggle/input/vlg-recruitment-24-challenge/vlg-dataset/vlg-dataset/test"

In [3]:
# Importing Necessary Libraries
import os
import pandas as pd
import torch
import clip
from PIL import Image

In [None]:
# Zero-shot classification of every image in `test_dir` with CLIP; the
# predictions are collected and written to a submission CSV.

# File extensions (compared lower-case) that are treated as images
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')

# Candidate class labels. This single list is used both for tokenization and
# for mapping the arg-max index back to a class name, so the two cannot drift apart.
labels = ["antelope", "grizzly bear", "killer whale", "beaver", "dalmatian", "persian cat", "horse", "german shepherd", "blue whale", "siamese cat", "skunk", "mole", "tiger", "hippopotamus", "leopard", "moose", "spider monkey", "humpback whale", "elephant", "gorilla", "ox", "fox", "sheep", "seal", "chimpanzee", "hamster", "squirrel", "rhinoceros", "rabbit", "bat", "giraffe", "wolf", "chihuahua", "rat", "weasel", "otter", "buffalo", "zebra", "giant panda", "deer", "bobcat", "pig", "lion", "mouse", "polar bear", "collie", "walrus", "raccoon", "cow", "dolphin"]

# ---- One-time setup (loop-invariant, so done once instead of per image) ----
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the CLIP model (ViT-B/32 architecture) and its image preprocessing pipeline
model, preprocess = clip.load("ViT-B/32", device=device)
# Tokenize the labels into token-ID tensors (the text encoder, not the
# tokenizer, is what turns them into embeddings)
text = clip.tokenize(labels).to(device)
with torch.no_grad():
    # The label embeddings are identical for every image: encode and
    # L2-normalize them once up front.
    text_features = model.encode_text(text)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Scale applied to the cosine similarities, as in the CLIP model's own forward pass
    logit_scale = model.logit_scale.exp()

# Prediction records, one dict per image
data = []

# Sorted so the row order of the output is reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(test_dir)):
    # Skip anything that does not look like an image file
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue
    image_path = os.path.join(test_dir, filename)
    # Open inside a context manager so the file handle is released promptly;
    # preprocess() returns a tensor, so nothing refers to the file afterwards.
    with Image.open(image_path) as img:
        # Preprocess the image and add a leading batch dimension
        image = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        # Encode the image once and L2-normalize so the dot product below is a cosine similarity
        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # Scaled similarity between this image and every label
        logits_per_image = logit_scale * image_features @ text_features.t()
    # Turn the similarity scores into probabilities over the labels
    preds = logits_per_image.softmax(dim=-1).cpu().numpy()
    # Pick the label with the highest probability
    predicted_class = labels[preds.argmax()]
    print({'image_id': filename, 'class': predicted_class})
    data.append({'image_id': filename, 'class': predicted_class})

# Build the submission table; explicit columns keep the CSV header intact
# even when no image was found.
submission = pd.DataFrame(data, columns=['image_id', 'class'])
# Save the predictions to a CSV file for submission
submission.to_csv('/kaggle/working/submission.csv', index=False)