In [None]:
%%capture 
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

# Imports

In [None]:
import numpy as np
import torch
import clip
from tqdm.notebook import tqdm
from pkg_resources import packaging

import torchvision
import torchvision.transforms as t
from torch.utils.data import DataLoader, TensorDataset

# Record the PyTorch build this notebook was run with.
print(f"Torch version: {torch.__version__}")

Torch version: 1.13.1+cu116


In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


# Model Initialization

In [None]:
# List the names of the pretrained CLIP models; one of them ("RN50") is
# passed to clip.load() in the next cell.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [None]:
# Load the "RN50" CLIP checkpoint. clip.load returns the model together with
# the image preprocessing transform that is later used as the dataset transform.
# NOTE(review): no device argument is passed, so placement is whatever
# clip.load defaults to — confirm it matches `device`.
clip_model, preprocess = clip.load("RN50")

## Dataset

### Image

In [None]:
# Display the preprocessing pipeline returned by clip.load; the same object is
# passed as `transform` when the Flowers102 dataset is built below.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7fdc0c78ab80>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [None]:
# Flowers102 training split; every image goes through CLIP's preprocessing.
flower = torchvision.datasets.Flowers102(
    root='/content/',
    split='train',
    transform=preprocess,
    download=True,
)
print(flower)

# Fixed-order batches of 32 images (no shuffling).
train_dataloader = DataLoader(dataset=flower, batch_size=32, shuffle=False)

Dataset Flowers102
    Number of datapoints: 1020
    Root location: /content/
    split=train
    StandardTransform
Transform: Compose(
               Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
               CenterCrop(size=(224, 224))
               <function _convert_image_to_rgb at 0x7fdc0c78ab80>
               ToTensor()
               Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
           )


In [None]:
# Compute CLIP image embeddings for the training images, one batch at a time.
#
# This is inference only, so the loop runs under torch.no_grad(): without it
# every appended output keeps its autograd graph alive and GPU memory grows
# with each batch. With that fixed, all batches are processed (no batch-index
# filter is needed to stay within GPU memory).
#
# NOTE(review): later cells load '/content/flower_image_emb_1024.pt' and
# '/content/flower_labels.pt', but no visible cell writes them — presumably
# these outputs were saved by hand; confirm and add the save step if so.
image_embeddings = []
with torch.no_grad():
  for img, _label in tqdm(train_dataloader):
    img = img.to(device)
    op = clip_model.visual(img)
    image_embeddings.append(op)


  0%|          | 0/32 [00:00<?, ?it/s]

### Custom Dataset

In [None]:
# Replace the image loader with one that serves (embedding, label) pairs read
# from disk, in fixed order and in batches of 32.
# NOTE(review): torch.load unpickles the files — only load trusted files.
train_dataloader = DataLoader(
    dataset=TensorDataset(
        torch.load('/content/flower_image_emb_1024.pt'),
        torch.load('/content/flower_labels.pt'),
    ),
    batch_size=32,
    shuffle=False,
)

### Text

In [None]:
# Text prompt template; the {} placeholder is filled with each class name
# when the per-class prompts are built.
prompt = "a photo of a {}, a type of flower."

In [None]:
import json

# Read the category-id -> flower-name table (a JSON object stored in a .txt file).
with open("flowers102_cat_to_name.txt", 'r') as name_file:
    mapping = json.load(name_file)

# Re-key with integer ids shifted down by one, so that ids start one lower
# than in the file; names are coerced to str.
mapping = dict(
    (int(cat_id) - 1, str(flower_name))
    for cat_id, flower_name in mapping.items()
)

In [None]:
# One prompt per class, ordered by ascending class id.
text_class = [prompt.format(mapping[class_id]) for class_id in sorted(mapping)]

In [None]:
# Tokenize the class prompts and embed them with CLIP's text encoder.
# - Runs under torch.no_grad(): the embeddings are fixed inputs to the
#   evaluation below, so no autograd graph needs to be kept.
# - The tokens are bound to `text_tokens`, not `input`, so the builtin
#   input() is not shadowed for the rest of the notebook.
with torch.no_grad():
    text_tokens = clip.tokenize(text_class).to(device)
    text_emb = clip_model.encode_text(text_tokens)
text_emb.shape

torch.Size([102, 1024])

# Zero Shot

In [None]:
def zeroShot(train_dataloader,clip_model,text_features=None):
  """Zero-shot classification of image embeddings against class text embeddings.

  Each image embedding and each text embedding is L2-normalised, the scaled
  similarity ``logit_scale * img @ text.T`` is computed, and the class with the
  largest logit is the prediction. Accuracy and macro F1 are computed once
  over all samples seen and printed.

  Args:
    train_dataloader: iterable yielding ``(img_emb, label)`` batches of torch
      tensors (assumes ``img_emb`` is (batch, dim) with ``dim`` matching the
      text embeddings — TODO confirm against the stored files).
    clip_model: object exposing ``logit_scale`` (its ``exp()`` scales the logits).
    text_features: optional class text embeddings, one row per class. When
      omitted, the notebook-level ``text_emb`` is used.

  Notes:
    - The text embeddings are read without being reassigned inside the
      function; assigning to ``text_emb`` here would make it a local name and
      raise UnboundLocalError on first use.
    - Metrics are computed over the pooled predictions rather than averaged
      per batch: a mean of per-batch macro-F1 scores is not the dataset's
      macro-F1, and a short final batch would be over-weighted in the mean
      accuracy.
  """
  from sklearn.metrics import accuracy_score, f1_score

  if text_features is None:
    text_features = text_emb  # notebook-level embeddings from encode_text

  preds = []
  targets = []
  with torch.no_grad():
    # Loop-invariant: normalise the text side and read the scale once.
    text_unit = text_features / text_features.norm(dim=-1, keepdim=True)
    logit_scale = clip_model.logit_scale.exp()

    for img_emb, label in tqdm(train_dataloader):
      # Align stored embeddings with the text features before the matmul.
      img_emb = img_emb.to(device=text_unit.device, dtype=text_unit.dtype)
      img_unit = img_emb / img_emb.norm(dim=-1, keepdim=True)
      logits = logit_scale * img_unit @ text_unit.t()
      preds.append(logits.max(dim=1).indices.cpu())
      targets.append(label.cpu())

  if not preds:
    # Nothing was evaluated; report NaN instead of failing on an empty concat.
    print("Accuracy: ",float('nan'),"\nF1-score: ", float('nan'))
    return

  y_pred = torch.cat(preds).numpy()
  y_true = torch.cat(targets).numpy()
  print("Accuracy: ",accuracy_score(y_true,y_pred),"\nF1-score: ", f1_score(y_true,y_pred,average='macro'))

  0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# Run the zero-shot evaluation over the (embedding, label) loader; the
# function prints accuracy and macro F1.
zeroShot(train_dataloader,clip_model)

Accuracy:  0.6513671875 
F1-score:  0.3341663202349131


# Linear Probe 



*   Image -> CLIP image embeddings
*   CLIP image embeddings -> Linear classification

### Meta-Learning: Few Shot setting 
Shots per class, based on the paper: 1, 2, 4, 8, 16.






In [None]:
import random
import torch
import numpy as np

In [None]:
# Load the stored image embeddings and their labels from disk.
# NOTE(review): torch.load unpickles the files — only load trusted files.
image_embeddings = torch.load('/content/flower_image_emb_1024.pt')
labels = torch.load('/content/flower_labels.pt')

In [None]:
def subset_train(image_embeddings,labels,nshot,index,n_classes,per_class=10):
    """Draw a shuffled n-shot training subset that avoids held-out positions.

    The data is treated as consecutive class blocks: block starts are spaced
    ``labels.shape[0] // n_classes`` apart and each block spans ``per_class``
    positions. From every block, ``nshot`` positions that are not in ``index``
    are sampled without replacement, then the selected rows are shuffled.

    Sampling and shuffling both use a private RNG seeded with 1, so the result
    is reproducible and the global NumPy/torch RNG state is left untouched.
    Candidates are listed in ascending order so the seeded draw does not
    depend on set iteration order.

    Args:
        image_embeddings: array/tensor indexable by an integer array on axis 0.
        labels: array/tensor aligned with ``image_embeddings``; needs ``.shape``.
        nshot: number of samples to draw per class block.
        index: positions to exclude (the held-out test positions).
        n_classes: number of class blocks.
        per_class: number of positions in each class block.

    Returns:
        ``[train_img_emb, train_label]`` with rows in the same shuffled order.

    Raises:
        ValueError: if a block has fewer than ``nshot`` non-excluded positions.
    """
    rng = np.random.RandomState(1)
    excluded = set(int(i) for i in np.asarray(index).ravel())
    n_total = labels.shape[0]
    stride = n_total // n_classes

    picks = []
    for start in range(0, n_total, stride):
        candidates = [i for i in range(start, start + per_class) if i not in excluded]
        if nshot > len(candidates):
            raise ValueError(
                "nshot=%d exceeds the %d samples left in the class block starting at %d"
                % (nshot, len(candidates), start))
        picks.append(rng.choice(candidates, nshot, replace=False))
    index_new = np.array(picks).flatten()

    '''Train set'''
    train_label = labels[index_new]
    train_img_emb = image_embeddings[index_new]
    print("Train set: ",train_img_emb.shape, train_label.shape)

    # Shuffle the training rows; the length comes from the index array so this
    # works for both NumPy arrays and torch tensors.
    order = rng.permutation(index_new.shape[0])
    train_img_emb = train_img_emb[order]
    train_label = train_label[order]
    return [train_img_emb, train_label]


def subset_data(image_embeddings,labels,nshot = 8,test_size = 2,n_classes = None, per_class = 10 ):
  '''
  Split the data into a held-out test set and one or more n-shot train sets.

  The data is treated as consecutive class blocks: block starts are spaced
  ``labels.shape[0] // n_classes`` apart and each block spans ``per_class``
  positions. ``test_size`` positions per block are sampled (seed 1, without
  replacement) as the test set; the training subset(s) are then drawn by
  ``subset_train`` from the remaining positions.

  Args:
    image_embeddings: array/tensor indexable by an integer array on axis 0.
    labels: array/tensor aligned with ``image_embeddings``; needs ``.shape``.
    nshot: an int, or a list of ints to build several training sets at once.
    test_size: number of held-out samples per class block.
    n_classes: number of class blocks. When None it is inferred as the number
      of distinct values in ``labels`` (assumes labels can be converted with
      ``np.asarray``, i.e. live on the CPU). The default used to reference a
      ``cfg`` object at definition time, which raises NameError when ``cfg``
      is not defined.
    per_class: number of positions in each class block.

  Returns:
    ``(train_img_emb, train_label, test_img_emb, test_label)`` when ``nshot``
    is not a list; otherwise ``(train, test_img_emb, test_label)`` where
    ``train`` maps each n to ``[train_img_emb, train_label]``.
  '''
  if n_classes is None:
    n_classes = len(np.unique(np.asarray(labels)))

  # Create Test set
  # randomly choose test_size data points from each class block, using a
  # private seeded RNG so the global NumPy RNG state is not modified
  rng = np.random.RandomState(1)
  n_total = labels.shape[0]
  stride = n_total // n_classes
  index = np.array([
      rng.choice(np.arange(start, start + per_class), test_size, replace=False)
      for start in range(0, n_total, stride)
  ]).flatten()
  '''Test set'''
  test_label = labels[index]
  test_img_emb = image_embeddings[index]
  print("Test set: ",test_img_emb.shape, test_label.shape)

  if not isinstance(nshot, list):
    train_img_emb, train_label = subset_train(image_embeddings,labels,nshot,index,n_classes,per_class)
    return (train_img_emb, train_label, test_img_emb,test_label)

  # One training subset per requested shot count, all sharing the same test set.
  train = {}
  for n in nshot:
    train[n] = subset_train(image_embeddings,labels,n,index,n_classes,per_class)
  return (train,test_img_emb,test_label)
  
 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def logistic(train,y_train,test,y_test):
  """Fit a multinomial logistic-regression probe and report test metrics.

  Args:
    train: training features, one row per sample.
    y_train: training labels aligned with ``train``.
    test: test features.
    y_test: test labels aligned with ``test``.

  Returns:
    ``(accuracy, macro_f1)`` as unrounded values, so callers can collect and
    compare results across runs. The same two metrics are also printed,
    rounded to two decimals.
  """
  # define the multinomial logistic regression model
  model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=1000)
  model.fit(train, y_train)

  yhat = model.predict(test)
  accuracy = accuracy_score(y_test,yhat)
  macro_f1 = f1_score(y_test,yhat,average='macro')
  print("Accuracy: ",np.round(accuracy,2))
  print("F1-score: ",np.round(macro_f1,2))
  return accuracy, macro_f1


In [None]:
# Linear-probe sweep: for each shot count, draw the split and fit/evaluate the
# logistic-regression probe on it.
for nshot in (1, 2, 4, 8):
  print("Few shot nshot: ", nshot)
  (train_img_emb, train_label,
   test_img_emb, test_label) = subset_data(
      image_embeddings, labels, nshot=nshot, test_size=2, n_classes=102)
  logistic(train_img_emb, train_label, test_img_emb, test_label)
  print("\n")

Few shot nshot:  1
Test set:  (204, 1024) (204,)
Train set:  (102, 1024) (102,)
Accuracy:  0.51
F1-score:  0.45


Few shot nshot:  2
Test set:  (204, 1024) (204,)
Train set:  (204, 1024) (204,)
Accuracy:  0.66
F1-score:  0.63


Few shot nshot:  4
Test set:  (204, 1024) (204,)
Train set:  (408, 1024) (408,)
Accuracy:  0.76
F1-score:  0.72


Few shot nshot:  8
Test set:  (204, 1024) (204,)
Train set:  (816, 1024) (816,)
Accuracy:  0.88
F1-score:  0.87




# Other Datasets