In [1]:
import os
from torchvision.io import read_image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import logging
import torchvision.transforms.v2 as transforms
import torch

# set logging level
logging.basicConfig(level=logging.DEBUG)

# Dataset and Dataloader for ImageNet-A

In [6]:
from torch.utils.data import Dataset
from PIL import Image
import os
import torch
import logging

class DatasetImageNetA(Dataset):
    def __init__(self, path, transform=None, target_transform=None):
        self.path = path
        self.transform = transform
        self.target_transform = target_transform

        # Read the mapping file
        with open(os.path.join(path, 'README.txt'), 'r') as f:
            lines = f.readlines()[12:]

        # Create the mapping dictionary
        mapping = {}
        for line in lines:
            if line.strip():
                parts = line.strip().split()
                numeric_id = parts[0]
                name = ' '.join(parts[1:]).lower()
                mapping[numeric_id[1:]] = name

        # Create the labels list
        tokens = []
        for cl in mapping.keys():
            for file_name in os.listdir(os.path.join(path, f"n{str(cl)}")):
                tokens.append((cl, file_name))

        self.mapping = mapping
        self.tokens = tokens
        self.classes = mapping.values()

        logging.debug(f"Number of images: {len(self.tokens)}")
        logging.debug(f"Number of classes: {len(self.mapping)}")

        if len(self.tokens) != 7500:
            logging.error("The dataset is not correct")
            raise ValueError(f"The dataset is not correct: {len(self.tokens)} images and {len(self.mapping)} classes")
        
        if len(self.mapping) != 200:
            logging.error("The dataset is not correct")
            raise ValueError(f"The dataset is not correct: {len(self.tokens)} images and {len(self.mapping)} classes") 
          

    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, idx):
        cl, file_name = self.tokens[idx]
        img_path = os.path.join(self.path, f"n{str(cl)}", file_name)
        
        # Load the image as a PIL image
        img = Image.open(img_path).convert('RGB')
        
        # Convert class label to tensor
        target = torch.tensor(int(cl))

        # # Apply transformations if provided
        # if self.transform:
        #     img = self.transform(img)
        
        # if self.target_transform:
        #     target = self.target_transform(target)
        
        return img, target



In [10]:
dataset = DatasetImageNetA(
    "dataset/imagenet-a",
    # transform=transforms.Compose(
    #     [transforms.Resize((300, 300)), transforms.ToTensor()]
    # ) 
)


DEBUG:root:Number of images: 7500
DEBUG:root:Number of classes: 200


200

In [30]:
## Test the dataset

# shapes = {}

# for img in imageneta:
#     shapes[img[0].shape] = shapes.get(img[0].shape, 0) + 1
# shapes

# Dataloader with batch loading
It manages images in rgb and b&w (copying the data in the missing dimension).

In [31]:
def collate_fn(batch):
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Check the number of channels for each image
    num_channels = [img.shape[0] for img in images]

    # Convert images to 3 channels if necessary
    for i, img in enumerate(images):
        if num_channels[i] == 4:
            images[i] = img[:3]  # Take the first 3 channels
        elif num_channels[i] == 1:
            images[i] = img.repeat(3, 1, 1)  # Convert grayscale to 3 channels

    # Stack the images into a batch
    batch_images = torch.stack(images, dim=0)

    # Move the batch to the GPU
    batch_images = batch_images.cuda()
    batch_labels = torch.tensor(labels).cuda()

    return batch_images, batch_labels

# train_loader = DataLoader(dataset, shuffle=True, batch_size=1, collate_fn=collate_fn)


In [32]:
# if logging.getLogger().isEnabledFor(logging.DEBUG):
#     logging.debug("Checking if the collate function works correctly")
#     for index, data in enumerate(train_loader):
#         # print(index, data[0].shape, data[1].shape)
#         pass
#     logging.debug("Collate function works correctly")

# 

In [11]:
import os
import clip
import torch
from torchvision.datasets import CIFAR100


In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

In [13]:
dataset = DatasetImageNetA("dataset/imagenet-a")

DEBUG:root:Number of images: 7500
DEBUG:root:Number of classes: 200


In [14]:
image, class_id = dataset[0]
image_input = preprocess(image).unsqueeze(0).to(device)

In [16]:
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in dataset.mapping.keys()]).to(device)

In [19]:
with torch.no_grad():
    image_features = model.encode_image(image_input)
    # text_features = model.encode_text(text_inputs)

In [20]:
text_inputs.shape

torch.Size([200, 77])

In [21]:
# text_inputs = torch.cat([clip.tokenize(f"a photo of a {dataset.mapping[c[0]]}") for c in dataset.labels]).to(device)

# text_features = []
# with torch.no_grad():
#     for i in range(0, len(text_inputs), batch_size):
#         batch = text_inputs[i:i+batch_size]
#         batch_features = model.encode_text(batch).half()
#         text_features.append(batch_features)

# text_features = torch.cat(text_features, dim=0)


text_inputs = torch.cat([clip.tokenize(f"a photo of a {dataset.mapping[c[0]]}") for c in dataset.tokens]).to('cuda')
# model.to('cpu')
with torch.no_grad():
    text_features = model.encode_text(text_inputs).half()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.20 GiB. GPU 

In [9]:

text_inputs = torch.cat([clip.tokenize(f"a photo of a {dataset.mapping[c[0]]}") for c in dataset.tokens]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.20 GiB. GPU 

In [35]:
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
# cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = dataset[0]
image_input = preprocess(image).unsqueeze(0).to(device)
# image_input = image.unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {dataset.mapping[c[0]]}") for c in dataset.tokens]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{dataset.mapping[index]:>16s}: {100 * value.item():.2f}%")

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.10 GiB. GPU 

In [26]:
# import torch
# import torchvision.transforms as T

# device = "cuda" if torch.cuda.is_available() else "cpu"
# model, preprocess = clip.load('RN50', device)
# model = model.cuda().eval()


# # Define the custom preprocessing for tensor input
# def preprocess_tensor(tensor_image):
#     # Convert the tensor to float and scale to [0, 1]
#     tensor_image = tensor_image.float() / 255.0
    
#     preprocess = T.Compose([
#         T.Resize(32),  # Resize to 224x224, the size expected by ViT-B/32
#         T.CenterCrop(32),  # Crop to exactly 224x224
#         T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),  # Normalize with CLIP's specific mean and std
#                     std=(0.26862954, 0.26130258, 0.27577711))
#     ])
#     return preprocess(tensor_image)

# # Prepare the inputs
# image, class_id = dataset[0]

# # Ensure the image has 3 channels
# if image.shape[0] == 1:  # Grayscale
#     image = image.repeat(3, 1, 1)
# elif image.shape[0] == 4:  # More than 3 channels, trim to 3
#     image = image[:3, :, :]

# # Apply preprocessing
# image_input = preprocess_tensor(image.unsqueeze(0)).to(device)
# # print(image_input.shape)

# # # Process the text inputs as before
# text_inputs = torch.cat([clip.tokenize(f"a photo of a {dataset.mapping[c[0]]}") for c in dataset.labels]).to(device)

# # # Calculate features
# with torch.no_grad():
# #     image_features = model.encode_image(image_input)
#     text_features = model.encode_text(text_inputs)

# # # Pick the top 5 most similar labels for the image
# # image_features /= image_features.norm(dim=-1, keepdim=True)
# # text_features /= text_features.norm(dim=-1, keepdim=True)
# # similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
# # values, indices = similarity[0].topk(5)

# # # Print the result
# # print("\nTop predictions:\n")
# # for value, index in zip(values, indices):
# #     print(f"{dataset.mapping[index]:>16s}: {100 * value.item():.2f}%")


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.65 GiB. GPU 