In [5]:
!unrar x dataset.rar


UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from dataset.rar

Creating    dataset                                                   OK
Creating    dataset/english                                           OK
Extracting  dataset/english/1.png                                          0%  OK 
Extracting  dataset/english/10.png                                         0%  OK 
Extracting  dataset/english/100.png                                        0%  OK 
Extracting  dataset/english/1003.png                                       0%  OK 
Extracting  dataset/english/1004.png                                       0%  OK 
Extracting  dataset/english/1006.png                                       0%  OK 
Extracting  dataset/english/1007.png                                       0%  OK 
Extracting  dataset/english/1009.png                                       0%  OK 
Extracting  d

In [6]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-_i1by4x8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-_i1by4x8
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [7]:
import os
import argparse
import warnings
from PIL import Image, ImageDraw
from torchvision import transforms as T
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, Subset
from torch import nn
import torch
import clip
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from tkinter import filedialog
from tqdm import tqdm
from sklearn.model_selection import train_test_split

warnings.simplefilter("ignore")

torch.autograd.set_detect_anomaly(True)

# TODO:

# 0. implement checkpoints - saving and loading in functions
# 1. implement the validation set wala stuff - if args.val_dir not None then: blabla
# 2. implement cosine annealing schedule with decay
# 3. implement learning rate warmup trick from 1e-5 (? the place where you read in coop it was with sgd)
# 4. in general, read clip finetuning papers and references to figure out best way to do this

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7deb5ede9db0>

# Utils

In [56]:
# https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    for p in model.parameters():
          p.data = p.data.float()
          if p.grad is not None:
              p.grad.data = p.grad.data.float()

def convert_back_to_fp16(args, model):
  if args.method == 'base':
    clip.model.convert_weights(model)
  elif args.method == 'coop':
    clip.model.convert_weights(model)
    model.trainable_param.data = model.trainable_param.data.to(torch.float16)
  else:
    assert False, "Not Implemented"


def add_coop_approach(args, model):
    # ishaan: coop: add a parameter to the model to be pre-pended after tokenization.
    # TODO:
    # 1. initialise with a variance of 0.02 / 0.2, whatever is in that paper.
    ctx_vecs = torch.empty(args.n_ctx, model.transformer.width)
    nn.init.normal_(ctx_vecs, std=0.02)

    trainable_param = torch.nn.Parameter(ctx_vecs)
    model.register_parameter('trainable_param', trainable_param)
    def encode_text_coop(text):
        x = model.token_embedding(text).type(model.dtype)  # [batch_size, n_ctx, d_model]

        context_embedding = model.trainable_param.unsqueeze(0).repeat(x.shape[0], 1, 1)

        x = torch.cat((x[:, :1], context_embedding, x[:, 1+context_embedding.shape[1]:]), dim=1)
        x = x + model.positional_embedding.type(model.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = model.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = model.ln_final(x).type(model.dtype)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ model.text_projection

        return x
    model.encode_text_coop = encode_text_coop
    return model


def train_val_dataset(dataset, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets['train'], datasets['val']


In [57]:

def accuracy_clip_base(args, model, preprocess, loader, classification_strings):
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
      for batch in tqdm(loader):
          images, labels = batch
          images, labels = images.to(args.device), labels.to(args.device)
          texts = clip.tokenize(classification_strings).to(args.device)


          # logits_per_image, logits_per_text = model(images, texts)
          # get image features
          image_features = model.encode_image(images)
          image_features = image_features / image_features.norm(dim=-1, keepdim=True)

          # get text features
          class_features = model.encode_text(texts)
          similarity = (100 * image_features @ class_features.T).softmax(dim=-1)
          predictions = torch.argmax(similarity, dim=1)
          total += similarity.shape[0]
          correct += int(sum(predictions == labels))

    return correct / total

def accuracy_clip_coop(args, model, preprocess, loader, classification_strings):
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
      for batch in tqdm(loader):
          images, labels = batch
          images, labels = images.to(args.device), labels.to(args.device)
          texts = clip.tokenize(classification_strings).to(args.device)


          # logits_per_image, logits_per_text = model(images, texts)
          # get image features
          image_features = model.encode_image(images)
          image_features = image_features / image_features.norm(dim=-1, keepdim=True)

          # get text features
          class_features = model.encode_text_coop(texts) # using the modded text encoder
          similarity = (100 * image_features @ class_features.T).softmax(dim=-1)
          predictions = torch.argmax(similarity, dim=1)
          total += similarity.shape[0]
          correct += int(sum(predictions == labels))

    return correct / total


In [58]:

def train_clip_base(args, model, preprocess, loader, optimizer, criterion, classification_strings):
    losses = []
    model.train()
    for batch in tqdm(loader):
        optimizer.zero_grad()
        images, labels = batch
        images, labels = images.to(args.device), labels.to(args.device)
        texts = clip.tokenize(classification_strings).to(args.device)

        # get image features
        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # get text features
        class_features = model.encode_text(texts) # using the modded text encoder
        similarity = (100 * image_features @ class_features.T).softmax(dim=-1)
        predictions = torch.argmax(similarity, dim=1)

        loss = nn.CrossEntropyLoss()(similarity, labels)
        loss.backward()
        losses.append(loss.item())
        if args.device == "cpu":
            optimizer.step()
        else :
            convert_models_to_fp32(model)
            optimizer.step()
            convert_back_to_fp16(args, model)
    return losses

def train_clip_coop(args, model, preprocess, loader, optimizer, criterion, classification_strings):
    losses = []
    model.train()
    for batch in tqdm(loader):
        optimizer.zero_grad()
        images, labels = batch
        images, labels = images.to(args.device), labels.to(args.device)
        texts = clip.tokenize(classification_strings).to(args.device)

        # get image features
        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # get text features
        class_features = model.encode_text_coop(texts) # using the modded text encoder
        similarity = (100 * image_features @ class_features.T).softmax(dim=-1)
        predictions = torch.argmax(similarity, dim=1)

        loss = nn.CrossEntropyLoss()(similarity, labels)
        loss.backward()
        losses.append(loss.item())
        if args.device == "cpu":
            optimizer.step()
        else :
            convert_models_to_fp32(model)
            optimizer.step()
            convert_back_to_fp16(args, model)
    return losses


In [59]:
def save_checkpoint(args, model, epoch):
  torch.save(model.state_dict(), os.path.join(args.checkpoint_dir), args.checkpoint_name + str(epoch) + '.pt')

# training

In [None]:
# TODO:
# 1. base clip should not finetune only measure final performance maybe
# 2. organise every method-specific thing into method functions. go through code and spot these. for example, add_coop_approach should be wrapped inside mod_model and accuracy_clip_coop should be wrapped inside get_accuracy.
# also, optimizer wala line ko bhi consider in 2.

# 3. add conditionals based on the approach. but dont take it too far in step 2.
# 4. add functionality to load from checkpoints
# 5. implement final accuracy and stats printing and eval at the end. in a function maybe? idk

def main(args):
    model, preprocess = clip.load(args.clip_model_name, device=args.device, jit=False)
    model = add_coop_approach(args, model)
    convert_back_to_fp16(args, model)
    model = model.to(args.device)
    dataset = ImageFolder(args.dataset_dir, transform=preprocess)
    train_dataset, val_dataset = train_val_dataset(dataset, 0.25)
    train_loader, val_loader = DataLoader(train_dataset, args.batch_size, shuffle=True), DataLoader(val_dataset, args.batch_size, shuffle=True)
    classification_strings = [f'image of {class_name} language text' for class_name in dataset.classes] # prompt tuning returned best - # classification_strings = ['image of odiya language text', 'image of english language text']
    classification_strings = ['X ' * args.n_ctx + class_name for class_name in dataset.classes] # prompt tuning returned best - # classification_strings = ['image of odiya language text', 'image of english language text']

    if args.device == 'cpu':
        model.float()
    else:
        clip.model.convert_weights(model)  # actually this line is unnecessary since clip by default is already on float16

    optimizer = torch.optim.Adam([model.trainable_param], lr=args.lr, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.2) # taken from a paper.
    criterion = nn.CrossEntropyLoss()

    losses = []
    train_accuracies = []
    val_accuracies = []

    if not os.path.exists(args.checkpoint_dir):
      os.makedirs(args.checkpoint_dir)

    if args.method == 'base':
        pass

    elif args.method == 'coop':
        train_acc = accuracy_clip_coop(args, model, preprocess, train_loader, classification_strings)
        val_acc = accuracy_clip_coop(args, model, preprocess, val_loader, classification_strings)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        print(f'before training:  train:{train_acc * 100:.2f}   val:{val_acc * 100:.2f}')
        for epoch in range(args.epochs):
            losses = losses + train_clip_coop(args, model, preprocess, train_loader, optimizer, criterion, classification_strings)
            train_acc = accuracy_clip_coop(args, model, preprocess, train_loader, classification_strings)
            val_acc = accuracy_clip_coop(args, model, preprocess, val_loader, classification_strings)
            train_accuracies.append(train_acc)
            val_accuracies.append(val_acc)
            print(f'end of epoch {epoch+1}:  train:{train_acc * 100:.2f}   val:{val_acc * 100:.2f}')

            # implement checkpointing here
            # save_checkpoint(args, model, epoch) # some bug here idk

        # implement final accuracy and stats and eval printing here

        plt.plot(losses)
        plt.show()


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--method", type=str, default='coop')
    parser.add_argument("--dataset-dir", type=str, default="dataset")
    parser.add_argument("--val-dir", type=str, default=None)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--clip-model-name", type=str, default="ViT-B/16") # RN50x16, ViT-B/16
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--epochs", type=int, default=10) # should be around 50-200? idk. check some reference papers on finetuning clip
    parser.add_argument("--lr", type=float, default=3e-4) # low might be because of initial learning rate explosion. finetune the transformers onto this.
    parser.add_argument("--checkpoint-dir", type=str, default='checkpoints')
    parser.add_argument("--checkpoint-name", type=str, default='run1coop')
    parser.add_argument("--load-from-checkpoint", type=str, default=None)
    parser.add_argument("--n-ctx", type=int, default=16) # the size of the coop trainable parameters
    args = parser.parse_args(args=[])
    return args

args = get_args()
main(args)


100%|██████████| 107/107 [00:05<00:00, 17.92it/s]
100%|██████████| 36/36 [00:01<00:00, 21.18it/s]


before training:  train:53.57   val:50.18


100%|██████████| 107/107 [01:31<00:00,  1.17it/s]
100%|██████████| 107/107 [00:06<00:00, 17.67it/s]
100%|██████████| 36/36 [00:01<00:00, 19.96it/s]


end of epoch 1:  train:99.06   val:98.95


100%|██████████| 107/107 [01:30<00:00,  1.18it/s]
100%|██████████| 107/107 [00:05<00:00, 18.46it/s]
100%|██████████| 36/36 [00:01<00:00, 21.03it/s]


end of epoch 2:  train:99.65   val:99.65


 69%|██████▉   | 74/107 [01:02<00:26,  1.23it/s]