In [1]:
# Install needed packages

!pip install open_clip_torch torchvision torch scikit-learn tqdm

Collecting open_clip_torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (fr

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import open_clip
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

### **Load the CLIP model**

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Three values come back; the middle one is not used in this notebook.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
)
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# .to() and .eval() both return the module itself, so the chained call moves
# the model, switches it to eval mode, and still shows its repr as cell output.
model.to(device).eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

### **Load CIFAR10 dataset**

In [4]:
# Load CIFAR-10 data
# Every image goes through the preprocessing pipeline returned alongside the model.
transform = preprocess
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)

# These loaders are only iterated once to extract features (under no_grad), so
# batch order carries no training signal. Keeping both unshuffled makes the
# extracted feature/label order identical from run to run.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=False)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)



100%|██████████| 170M/170M [00:12<00:00, 13.2MB/s]


In [5]:
# CIFAR-10 labels
# first_method_cifar10_classes = [
#     "airplane", "automobile", "bird", "cat", "deer",
#     "dog", "frog", "horse", "ship", "truck"
# ]

# second_method_cifar10_classes = [
#     "a photo of an airplane", "a photo of an automobile", "a photo of a bird",\
#     "a photo of a cat", "a photo of a deer", "a photo of a dog", \
#     "a photo of a frog", "a photo of a horse", "a photo of a ship",\
#     "a photo of a truck"
# ]

# Descriptive prompt per class. The position of each prompt is its class index:
# zero-shot predictions are argmax indices into this list and are compared
# directly with the dataset labels, so the order must not change.
third_method_cifar10_classes = [
    "a realistic picture of an aircraft",
    "a modern vehicle on the road",
    "an image of a flying bird",
    "an image of a pet cat",
    "an image of a deer standing in grass",
    "a realistic photo of a dog",
    "a close-up of a small frog",
    "a wild horse in nature",
    "an image of a ship at sea",
    "a realistic picture of a truck",
]

In [6]:
# Select the prompt set that the rest of the notebook uses as class descriptions.
cifar10_classes = third_method_cifar10_classes

### **Extract text features**

In [7]:
# Tokenize one prompt per class and place the token tensor on the model's device.
text_inputs = tokenizer(list(cifar10_classes)).to(device)

with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    # Scale each row to unit L2 norm.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)


### **Extract image features**

In [8]:
def extract_features(dataloader):
    """Encode every batch from ``dataloader`` with the notebook's CLIP ``model``.

    Relies on the module-level ``model`` and ``device`` defined earlier in the
    notebook. Gradients are disabled for the whole pass.

    Args:
        dataloader: Iterable yielding ``(images, targets)`` batches. ``images``
            must be a tensor (it is moved to ``device``); ``targets`` may be a
            tensor or anything ``torch.as_tensor`` accepts.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is the concatenation
        of the per-batch ``model.encode_image`` outputs (moved to CPU) and
        ``labels`` is the concatenation of the per-batch targets, both in the
        order the loader produced them.
    """
    feature_batches = []
    label_batches = []

    with torch.no_grad():
        for images, targets in tqdm(dataloader, desc="Extracting Features"):
            images = images.to(device)
            # Move results off the device right away so accelerator memory
            # does not grow with the size of the dataset.
            feature_batches.append(model.encode_image(images).cpu())
            # Keep each target batch whole: extending a Python list with a
            # tensor splits it into one 0-d tensor per sample, which then has
            # to be rebuilt element by element.
            label_batches.append(torch.as_tensor(targets))

    return torch.cat(feature_batches), torch.cat(label_batches)

In [9]:
# Encode the training split; returns the stacked features and their labels.
print("Extracting training set features")
train_features, train_labels = extract_features(trainloader)

# Encode the test split the same way.
print("Extracting test set features")
test_features, test_labels = extract_features(testloader)

Extracting training set features


Extracting Features: 100%|██████████| 782/782 [04:05<00:00,  3.18it/s]


Extracting test set features


Extracting Features: 100%|██████████| 157/157 [00:45<00:00,  3.48it/s]


In [10]:
# Rescale the features of both splits along the last dimension.
# Re-running this cell is harmless: normalizing already-normalized rows
# leaves them unchanged up to floating-point rounding.
train_features, test_features = (
    torch.nn.functional.normalize(split_features, dim=-1)
    for split_features in (train_features, test_features)
)

### **Train linear probe**

In [11]:
print("Training linear classifier")
# multi_class is deliberately not passed: scikit-learn deprecated the parameter
# (1.5+), and with the lbfgs solver a multinomial model is already what gets
# fitted whenever the labels contain more than two classes.
clf = LogisticRegression(
    random_state=0, C=0.316, max_iter=1000, solver='lbfgs', verbose=1
)
clf.fit(train_features.numpy(), train_labels.numpy())


Training linear classifier




### **Evaluate linear probe**

In [12]:
# Score the fitted probe on the test-split features.
linear_preds = clf.predict(test_features.numpy())
linear_acc = accuracy_score(y_true=test_labels.numpy(), y_pred=linear_preds)

### **Evaluate zero-shot**

In [13]:
def zero_shot_predict(features, text_features):
    """Return, for each row of ``features``, the index of its best-scoring text row.

    The score is the dot product between a feature row and a text row; it is a
    cosine similarity only when the caller passes unit-normalized inputs.

    Args:
        features: 2-D tensor of image features, one row per sample.
        text_features: 2-D tensor of text features, one row per class; it is
            copied to the CPU before the product is taken.

    Returns:
        1-D tensor of argmax indices along dimension 1 of the score matrix.
    """
    text_on_cpu = text_features.cpu()
    similarity = torch.matmul(features, text_on_cpu.T)
    return torch.argmax(similarity, dim=1)


In [14]:
# Classify the test features by their best-matching class prompt, then score.
zero_shot_preds = zero_shot_predict(
    features=test_features,
    text_features=text_features,
)
zero_shot_acc = accuracy_score(
    y_true=test_labels.numpy(),
    y_pred=zero_shot_preds.numpy(),
)

### **Final results**

In [15]:
# Report both accuracies as percentages with two decimals, one per line.
print(
    "Results:",
    f"Zero-shot accuracy: {zero_shot_acc * 100:.2f}%",
    f"Linear probe accuracy: {linear_acc * 100:.2f}%",
    sep="\n",
)

Results:
Zero-shot accuracy: 87.70%
Linear probe accuracy: 96.51%
