### Dataset: RESISC45

In [3]:
from huggingface_hub import hf_hub_download

for model_name in ['RN50', 'ViT-B-32', 'ViT-L-14']:
    checkpoint_path = hf_hub_download("chendelong/RemoteCLIP", f"RemoteCLIP-{model_name}.pt", cache_dir='checkpoints')
    print(f'{model_name} is downloaded to {checkpoint_path}.')

RemoteCLIP-RN50.pt:   0%|          | 0.00/408M [00:00<?, ?B/s]

RN50 is downloaded to checkpoints/models--chendelong--RemoteCLIP/snapshots/bf1d8a3ccf2ddbf7c875705e46373bfe542bce38/RemoteCLIP-RN50.pt.


RemoteCLIP-ViT-B-32.pt:   0%|          | 0.00/605M [00:00<?, ?B/s]

ViT-B-32 is downloaded to checkpoints/models--chendelong--RemoteCLIP/snapshots/bf1d8a3ccf2ddbf7c875705e46373bfe542bce38/RemoteCLIP-ViT-B-32.pt.


RemoteCLIP-ViT-L-14.pt:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

ViT-L-14 is downloaded to checkpoints/models--chendelong--RemoteCLIP/snapshots/bf1d8a3ccf2ddbf7c875705e46373bfe542bce38/RemoteCLIP-ViT-L-14.pt.


In [4]:
import torch, open_clip

model_name = 'ViT-B-32'
model, _, preprocess = open_clip.create_model_and_transforms(model_name)
tokenizer = open_clip.get_tokenizer(model_name)

ckpt = torch.load(f"checkpoints/models--chendelong--RemoteCLIP/snapshots/bf1d8a3ccf2ddbf7c875705e46373bfe542bce38/RemoteCLIP-{model_name}.pt", map_location="cpu")
message = model.load_state_dict(ckpt)
print(message)

<All keys matched successfully>


In [5]:
import os
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from collections import OrderedDict
import torch
from torchvision import transforms, models, datasets
from tqdm import tqdm

In [6]:
# Define transform and add image resolution
transform = transforms.Compose([preprocess])

# Load RESISC dataset
data_dir = '/data/scratch/public/resisc45'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), transform) for x in ['train', 'test']}
dataloaders = {
    'train': torch.utils.data.DataLoader(image_datasets['train'], batch_size=400, shuffle=True, num_workers=20),
    'test': torch.utils.data.DataLoader(image_datasets['test'], batch_size=400, shuffle=False, num_workers=20)
}

# Get classes
classes = image_datasets['test'].classes

In [7]:
# text descriptions
templates = [
    '{}',
    '{}.',
    'a photo of a {}.',
    'an image of a {}.',
    'a color image of a {}.',
    'a remote sensing image of many {}.',
    'a remote sensing image of a {}.',
    'a remote sensing image of the {}.',
    'a remote sensing image of the hard to see {}.',
    'a remote sensing image of a hard to see {}.',
    'a low resolution remote sensing image of the {}.',
    'a low resolution remote sensing image of a {}.',
    'a bad remote sensing image of the {}.',
    'a bad remote sensing image of a {}.',
    'a cropped remote sensing image of the {}.',
    'a cropped remote sensing image of a {}.',
    'a bright remote sensing image of the {}.',
    'a bright remote sensing image of a {}.',
    'a dark remote sensing image of the {}.',
    'a dark remote sensing image of a {}.',
    'a close-up remote sensing image of the {}.',
    'a close-up remote sensing image of a {}.',
    'a black and white remote sensing image of the {}.',
    'a black and white remote sensing image of a {}.',
    'a jpeg corrupted remote sensing image of the {}.',
    'a jpeg corrupted remote sensing image of a {}.',
    'a blurry remote sensing image of the {}.',
    'a blurry remote sensing image of a {}.',
    'a good remote sensing image of the {}.',
    'a good remote sensing image of a {}.',
    'a remote sensing image of the large {}.',
    'a remote sensing image of a large {}.',
    'a remote sensing image of the nice {}.',
    'a remote sensing image of a nice {}.',
    'a remote sensing image of the small {}.',
    'a remote sensing image of a small {}.',
    'a remote sensing image of the weird {}.',
    'a remote sensing image of a weird {}.',
    'a remote sensing image of the cool {}.',
    'a remote sensing image of a cool {}.',
    'an aerial image of many {}.',
    'an aerial image of a {}.',
    'an aerial image of the {}.',
    'an aerial image of the hard to see {}.',
    'an aerial image of a hard to see {}.',
    'a low resolution aerial image of the {}.',
    'a low resolution aerial image of a {}.',
    'a bad aerial image of the {}.',
    'a bad aerial image of a {}.',
    'a cropped aerial image of the {}.',
    'a cropped aerial image of a {}.',
    'a bright aerial image of the {}.',
    'a bright aerial image of a {}.',
    'a dark aerial image of the {}.',
    'a dark aerial image of a {}.',
    'a close-up aerial image of the {}.',
    'a close-up aerial image of a {}.',
    'a black and white aerial image of the {}.',
    'a black and white aerial image of a {}.',
    'a jpeg corrupted aerial image of the {}.',
    'a jpeg corrupted aerial image of a {}.',
    'a blurry aerial image of the {}.',
    'a blurry aerial image of a {}.',
    'a good aerial image of the {}.',
    'a good aerial image of a {}.',
    'an aerial image of the large {}.',
    'an aerial image of a large {}.',
    'an aerial image of the nice {}.',
    'an aerial image of a nice {}.',
    'an aerial image of the small {}.',
    'an aerial image of a small {}.',
    'an aerial image of the weird {}.',
    'an aerial image of a weird {}.',
    'an aerial image of the cool {}.',
    'an aerial image of a cool {}.',
    'a satellite image of many {}.',
    'a satellite image of a {}.',
    'a satellite image of the {}.',
    'a satellite image of the hard to see {}.',
    'a satellite image of a hard to see {}.',
    'a low resolution satellite image of the {}.',
    'a low resolution satellite image of a {}.',
    'a bad satellite image of the {}.',
    'a bad satellite image of a {}.',
    'a cropped satellite image of the {}.',
    'a cropped satellite image of a {}.',
    'a bright satellite image of the {}.',
    'a bright satellite image of a {}.',
    'a dark satellite image of the {}.',
    'a dark satellite image of a {}.',
    'a close-up satellite image of the {}.',
    'a close-up satellite image of a {}.',
    'a black and white satellite image of the {}.',
    'a black and white satellite image of a {}.',
    'a jpeg corrupted satellite image of the {}.',
    'a jpeg corrupted satellite image of a {}.',
    'a blurry satellite image of the {}.',
    'a blurry satellite image of a {}.',
    'a good satellite image of the {}.',
    'a good satellite image of a {}.',
    'a satellite image of the large {}.',
    'a satellite image of a large {}.',
    'a satellite image of the nice {}.',
    'a satellite image of a nice {}.',
    'a satellite image of the small {}.',
    'a satellite image of a small {}.',
    'a satellite image of the weird {}.',
    'a satellite image of a weird {}.',
    'a satellite image of the cool {}.',
    'a satellite image of a cool {}.',
]

In [8]:
accs = []
recalls = []
model = model.cuda()

for template in templates:
    text_descriptions = [template.format(label) for label in classes]
    model.eval()

    # Lists to store embeddings, images, and labels
    embeddings = []
    images = []
    image_labels = []

    # Iterate through the test dataloader
    for inputs, labels in tqdm(dataloaders['test']):
        inputs = inputs.to('cuda')

        # Forward pass to get embeddings
        with torch.no_grad():
            features = model.encode_image(inputs).detach().cpu()
            features /= features.norm(dim=-1, keepdim=True)

        # Store image embeddings and labels
        images.extend(inputs.cpu().numpy())
        embeddings.extend(features.numpy())
        image_labels.extend(labels.cpu().numpy())

    # Convert lists to numpy arrays
    embeddings_array = np.array(embeddings)
    image_labels_array = np.array(image_labels)

    # Tokenize text descriptions
    text_tokens = open_clip.tokenize(text_descriptions).cuda()

    # Convert image embeddings to PyTorch tensor
    embeddings_tensor = torch.tensor(embeddings_array)

    # Move the tensor to GPU if CUDA is available
    if torch.cuda.is_available():
        embeddings_tensor = embeddings_tensor.to('cuda')

    with torch.no_grad():
        # Encode text features
        text_features = model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Ensure data types are consistent
        if text_features.dtype == torch.float32:
            embeddings_tensor = embeddings_tensor.to(torch.float32)

    # Calculate text probabilities
    text_probs = (100.0 * embeddings_tensor @ text_features.T).softmax(dim=-1)
    top_probs, top_labels = text_probs.cpu().topk(5, dim=-1)

    # Convert image labels to PyTorch tensor
    true_labels_tensor = torch.tensor(image_labels_array, dtype=torch.long)

    # Ensure that both tensors are on the CPU
    predicted = top_labels[:, 0].cpu()
    true_labels = true_labels_tensor.cpu()
    correct_predictions = torch.sum(predicted == true_labels)

    # Calculate accuracy
    acc = correct_predictions.item() / len(image_labels)
    accs.append(acc)
    print(f"\nText #{len(accs)} => Accuracy: {acc * 100:.2f}%")
    print()

    # Calculate mean recall
    true_positives = torch.sum((predicted == true_labels) & (predicted == 1)).item()
    actual_positives = torch.sum(true_labels == 1).item()
    recalls.append(true_positives / actual_positives)

# Print average accuracy
avg_acc = sum(accs) / len(accs)
highest_acc = max(accs)
lowest_acc = min(accs)
print(f"\nAverage Accuracy over {len(accs)} different texts: {avg_acc * 100:.2f}%")
print(f"Highest Accuracy: {highest_acc * 100:.2f}%")
print(f"Lowest Accuracy: {lowest_acc * 100:.2f}%")

# Print mean recall
mean_recall = sum(recalls) / len(recalls)
print(f"\nMean Recall: {mean_recall * 100:.2f}%")

100%|██████████| 10/10 [00:47<00:00,  4.75s/it]



Text #1 => Accuracy: 83.43%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #2 => Accuracy: 83.84%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #3 => Accuracy: 82.22%



100%|██████████| 10/10 [00:05<00:00,  1.88it/s]



Text #4 => Accuracy: 82.38%



100%|██████████| 10/10 [00:05<00:00,  1.89it/s]



Text #5 => Accuracy: 84.33%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #6 => Accuracy: 82.46%



100%|██████████| 10/10 [00:05<00:00,  1.89it/s]



Text #7 => Accuracy: 81.09%



100%|██████████| 10/10 [00:05<00:00,  1.90it/s]



Text #8 => Accuracy: 82.33%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #9 => Accuracy: 82.28%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #10 => Accuracy: 81.79%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #11 => Accuracy: 81.97%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #12 => Accuracy: 82.56%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #13 => Accuracy: 81.43%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #14 => Accuracy: 81.71%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #15 => Accuracy: 82.04%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #16 => Accuracy: 82.12%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #17 => Accuracy: 82.30%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #18 => Accuracy: 81.45%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #19 => Accuracy: 83.25%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #20 => Accuracy: 82.30%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #21 => Accuracy: 82.07%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #22 => Accuracy: 82.40%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #23 => Accuracy: 84.38%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #24 => Accuracy: 84.18%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #25 => Accuracy: 84.36%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #26 => Accuracy: 84.54%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #27 => Accuracy: 81.56%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #28 => Accuracy: 82.33%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #29 => Accuracy: 81.51%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #30 => Accuracy: 81.35%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #31 => Accuracy: 82.71%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #32 => Accuracy: 82.69%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #33 => Accuracy: 82.82%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #34 => Accuracy: 83.30%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #35 => Accuracy: 79.86%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #36 => Accuracy: 80.09%



100%|██████████| 10/10 [00:05<00:00,  1.78it/s]



Text #37 => Accuracy: 79.96%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #38 => Accuracy: 80.30%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #39 => Accuracy: 81.27%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #40 => Accuracy: 82.79%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #41 => Accuracy: 83.43%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #42 => Accuracy: 82.87%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #43 => Accuracy: 83.33%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #44 => Accuracy: 82.74%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #45 => Accuracy: 82.28%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #46 => Accuracy: 81.61%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #47 => Accuracy: 80.58%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #48 => Accuracy: 83.12%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #49 => Accuracy: 81.12%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #50 => Accuracy: 82.20%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #51 => Accuracy: 81.27%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #52 => Accuracy: 82.87%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #53 => Accuracy: 81.56%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #54 => Accuracy: 84.00%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #55 => Accuracy: 83.48%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #56 => Accuracy: 82.51%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #57 => Accuracy: 81.66%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #58 => Accuracy: 84.31%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #59 => Accuracy: 83.59%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #60 => Accuracy: 82.53%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #61 => Accuracy: 82.07%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #62 => Accuracy: 81.81%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #63 => Accuracy: 80.38%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #64 => Accuracy: 82.66%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #65 => Accuracy: 81.40%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #66 => Accuracy: 82.02%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #67 => Accuracy: 81.97%



100%|██████████| 10/10 [00:05<00:00,  1.87it/s]



Text #68 => Accuracy: 82.35%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #69 => Accuracy: 81.63%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #70 => Accuracy: 81.15%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #71 => Accuracy: 77.88%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #72 => Accuracy: 82.33%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #73 => Accuracy: 81.07%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #74 => Accuracy: 82.79%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #75 => Accuracy: 80.17%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #76 => Accuracy: 84.25%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #77 => Accuracy: 82.97%



100%|██████████| 10/10 [00:05<00:00,  1.73it/s]



Text #78 => Accuracy: 83.28%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #79 => Accuracy: 83.46%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #80 => Accuracy: 83.25%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #81 => Accuracy: 82.43%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #82 => Accuracy: 82.40%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #83 => Accuracy: 83.20%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #84 => Accuracy: 82.43%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #85 => Accuracy: 82.48%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #86 => Accuracy: 82.74%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #87 => Accuracy: 83.05%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #88 => Accuracy: 82.79%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #89 => Accuracy: 84.20%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #90 => Accuracy: 84.00%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #91 => Accuracy: 82.92%



100%|██████████| 10/10 [00:05<00:00,  1.80it/s]



Text #92 => Accuracy: 82.69%



100%|██████████| 10/10 [00:05<00:00,  1.81it/s]



Text #93 => Accuracy: 83.28%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #94 => Accuracy: 82.89%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #95 => Accuracy: 83.05%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #96 => Accuracy: 82.94%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #97 => Accuracy: 82.38%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #98 => Accuracy: 81.86%



100%|██████████| 10/10 [00:05<00:00,  1.86it/s]



Text #99 => Accuracy: 83.07%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #100 => Accuracy: 82.69%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #101 => Accuracy: 82.87%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #102 => Accuracy: 82.56%



100%|██████████| 10/10 [00:05<00:00,  1.87it/s]



Text #103 => Accuracy: 83.15%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #104 => Accuracy: 82.53%



100%|██████████| 10/10 [00:05<00:00,  1.85it/s]



Text #105 => Accuracy: 80.55%



100%|██████████| 10/10 [00:05<00:00,  1.84it/s]



Text #106 => Accuracy: 79.78%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #107 => Accuracy: 82.74%



100%|██████████| 10/10 [00:05<00:00,  1.83it/s]



Text #108 => Accuracy: 81.76%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]



Text #109 => Accuracy: 82.94%



100%|██████████| 10/10 [00:05<00:00,  1.82it/s]


Text #110 => Accuracy: 81.56%


Average Accuracy over 110 different texts: 82.36%
Highest Accuracy: 84.54%
Lowest Accuracy: 77.88%

Mean Recall: 70.31%



