Using the fine-tuned CLIP image and text encoders for feature extraction: a few images per class are sampled for training, and the 512-dimensional image and text embeddings are used as input features for training classical ML models.

In [1]:
import os
import clip
import torch
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from skimage import io, transform
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the base CLIP model, then overwrite its weights with the fine-tuned checkpoint.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)  # Must set jit=False for training
# map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine.
checkpoint = torch.load("./model_30_5e7_001_fixed.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [3]:
# Few-shot setting: how many training examples are sampled for each class.
number = 5

In [4]:
# Locations of the CUB-200-2011 images and of the per-image text files.
images_root_pth = './birds/CUB_200_2011/images/'
text_root_pth = './birds/text/'

# Metadata tables shipped with the dataset (space-separated, no header row).
imgID_pth_df = pd.read_csv('./birds/CUB_200_2011/images.txt', sep=' ', header=None, names=['img_id', 'sub_pth'])
train_split_df = pd.read_csv('./birds/CUB_200_2011/train_test_split.txt', sep=' ', names=['img_id', 'is_training'])
class_names = pd.read_csv('./birds/CUB_200_2011/classes.txt', sep=' ', header=None, names=['class_id', 'class_name'])
# Strip the numeric prefix ("001.") and lowercase the class name.
class_names.class_name = class_names.class_name.map(lambda x: x.split('.')[1].lower())

cub_dataset_df = imgID_pth_df.merge(train_split_df, on='img_id', how='inner')

# The numeric prefix of each image sub-path is its class id.
target = [int(pth.split('.')[0]) for pth in cub_dataset_df['sub_pth']]
df = pd.DataFrame(target, columns=['class'])
train_df = cub_dataset_df[cub_dataset_df.is_training == 1]
# join() aligns on the row index, so each training row picks up its own class id.
final_df = train_df.join(df)

# Sample `number` training images per class. The per-class samples are collected
# in a list and concatenated once: DataFrame.append is deprecated (removed in
# pandas 2.0) and re-copies the whole frame on every iteration.
few_shot_samples = []
for class_id in range(1, 201):
    few_shot = final_df[final_df['class'] == class_id].sample(n=number, random_state=42)
    few_shot_samples.append(few_shot)
fewshot_df = pd.concat(few_shot_samples, ignore_index=True)

fewshot_df

  fewshot_df=fewshot_df.append(few_shot, ignore_index=True)


Unnamed: 0,img_id,sub_pth,is_training,class
0,56,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
1,35,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
2,49,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
3,37,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
4,18,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1
...,...,...,...,...
995,11782,200.Common_Yellowthroat/Common_Yellowthroat_00...,1,200
996,11759,200.Common_Yellowthroat/Common_Yellowthroat_00...,1,200
997,11774,200.Common_Yellowthroat/Common_Yellowthroat_00...,1,200
998,11762,200.Common_Yellowthroat/Common_Yellowthroat_00...,1,200


In [5]:


class CUBDataset(Dataset):
    """CUB dataset yielding (image, tokenized text, class id) triples."""

    def __init__(self, dataframe, img_root_dir, text_root_dir, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): Dataframe with paths and train/test split
                information. The image sub-path must be in the second column
                (positional index 1).
            img_root_dir (string): Root directory with all the images.
            text_root_dir (string): Root directory with the text files; each
                text file mirrors the image sub-path with a '.txt' extension.
            transform (callable, optional): Optional transform to be applied
                on the loaded image.
        """
        self.cub_img_df = dataframe
        self.img_root_dir = img_root_dir
        self.text_root_dir = text_root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the underlying dataframe."""
        return len(self.cub_img_df)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx (int or tensor): Positional row index into the dataframe.

        Returns:
            tuple: (image, text, target) where image is the (optionally
            transformed) image, text is the result of ``clip.tokenize`` on the
            first line of the matching text file, and target is the integer
            class id parsed from the numeric prefix of the sub-path.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sub_pth = self.cub_img_df.iloc[idx, 1]
        # Sub-paths look like "001.Black_footed_Albatross/...": the prefix is the class id.
        target = int(sub_pth.split('.')[0])

        # IMAGE PROCESSING
        img_name = os.path.join(self.img_root_dir, sub_pth)
        image = io.imread(img_name)
        if self.transform:
            image = self.transform(image)

        # TEXT PROCESSING
        # Swap the image extension for '.txt' regardless of the extension's length.
        text_file_name = os.path.join(self.text_root_dir, os.path.splitext(sub_pth)[0] + '.txt')
        # The context manager guarantees the file handle is closed after reading;
        # only the first line of the file is used.
        with open(text_file_name, "r") as text_file:
            content = text_file.readline()
        text = clip.tokenize(content)

        return image, text, target

# Build the CUB-200-2011 datasets: the few-shot training subset and the rows
# flagged is_training == 0, both using CLIP's own preprocessing pipeline.
clip_transform = transforms.Compose([transforms.ToPILImage(), preprocess])
test_df = cub_dataset_df[cub_dataset_df.is_training == 0]
train_set = CUBDataset(fewshot_df, images_root_pth, text_root_pth, transform=clip_transform)
test_set = CUBDataset(test_df, images_root_pth, text_root_pth, transform=clip_transform)

In [6]:
def get_img_txt_features(dataset, s1=None, batch_size=16):
    """Encode every sample of ``dataset`` with the CLIP image and text encoders.

    Uses the notebook-level ``model`` and ``device``.

    Args:
        dataset: Dataset yielding (image, tokenized text, label) triples.
        s1 (bool, optional): Whether the DataLoader shuffles the samples.
            ``None`` (the default) is treated as ``False``.
        batch_size (int): Number of samples encoded per forward pass.

    Returns:
        tuple: (image features, text features, labels) as CPU tensors whose
        rows are in the same (possibly shuffled) order.
    """
    all_img_features = []
    all_txt_features = []
    all_labels = []

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=bool(s1))
    with torch.no_grad():
        for images, details, labels in tqdm(loader):
            img_features = model.encode_image(images.to(device))
            # Each sample's tokens carry a leading dimension of 1; concatenating
            # the per-sample tensors yields one row of tokens per sample.
            txt_features = model.encode_text(torch.cat(tuple(details)).to(device))

            # Move each batch to the CPU right away so GPU memory does not grow
            # with the size of the dataset.
            all_img_features.append(img_features.cpu())
            all_txt_features.append(txt_features.cpu())
            all_labels.append(labels)
    return torch.cat(all_img_features), torch.cat(all_txt_features), torch.cat(all_labels).cpu()


In [7]:
# Encode the few-shot training subset (shuffled) and the test set (in order).
train_image_features, train_txt_features, train_labels = get_img_txt_features(
    train_set, s1=True
)
test_image_features, test_txt_features, test_labels = get_img_txt_features(
    test_set, s1=False
)

100%|██████████| 63/63 [00:12<00:00,  5.16it/s]
100%|██████████| 363/363 [01:06<00:00,  5.48it/s]


In [8]:
# Sanity-check the shapes of all six outputs. The fifth entry used to repeat
# test_image_features.shape, so the test text features were never inspected.
print(train_image_features.shape, train_txt_features.shape, train_labels.shape, test_image_features.shape,  test_txt_features.shape, test_labels.shape)

torch.Size([1000, 512]) torch.Size([1000, 512]) torch.Size([1000]) torch.Size([5794, 512]) torch.Size([5794, 512]) torch.Size([5794])


In [9]:
# Fuse the two modalities: image and text embeddings side by side per sample.
train_features = torch.cat([train_image_features, train_txt_features], dim=1)
test_features = torch.cat([test_image_features, test_txt_features], dim=1)

In [10]:
# Sanity check: shapes of the fused (image + text) feature matrices.
feature_shapes = (train_features.shape, test_features.shape)
print(*feature_shapes)

torch.Size([1000, 1024]) torch.Size([5794, 1024])


In [11]:
def _as_numpy(array_or_tensor):
    """Return a NumPy array, converting only when the input is still a torch tensor."""
    if torch.is_tensor(array_or_tensor):
        return array_or_tensor.numpy()
    return array_or_tensor


# scikit-learn expects NumPy arrays. The guard in _as_numpy keeps this cell
# re-runnable: a second run would otherwise call .numpy() on an ndarray and fail.
train_features = _as_numpy(train_features)
test_features = _as_numpy(test_features)
train_labels = _as_numpy(train_labels)
test_labels = _as_numpy(test_labels)

In [12]:
from sklearn import neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def evaluate_classifier(estimator, message_prefix):
    """Fit ``estimator`` on the few-shot features and report its test accuracy.

    Uses the notebook-level ``train_features``/``train_labels`` for fitting and
    ``test_features``/``test_labels`` for scoring.

    Args:
        estimator: Unfitted scikit-learn classifier.
        message_prefix (str): Text printed in front of the accuracy value.

    Returns:
        tuple: (predictions on the test features, accuracy in percent).
    """
    estimator.fit(train_features, train_labels)
    predicted = estimator.predict(test_features)
    # The builtin float replaces the np.float alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    percent_correct = np.mean((test_labels == predicted).astype(float)) * 100.
    print(f"{message_prefix}{percent_correct:.3f}")
    return predicted, percent_correct


# NOTE(review): with default settings LogisticRegression stops at its iteration
# limit on this data (see the ConvergenceWarning in the cell output); consider
# raising max_iter or scaling the features.
classifier = LogisticRegression()
predictions, accuracy = evaluate_classifier(classifier, "Accuracy Logistic = ")

clf = neighbors.KNeighborsClassifier()
predictions, accuracy = evaluate_classifier(clf, "Accuracy KNN= ")

scl = SVC(kernel='linear')
predictions, accuracy = evaluate_classifier(scl, "Accuracy SVM-Linear= ")

sc = SVC(kernel='rbf')
predictions, accuracy = evaluate_classifier(sc, "Accuracy SVM-RBF= ")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.


Accuracy Logistic = 50.483


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.


Accuracy KNN= 34.398


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.


Accuracy SVM-Linear= 49.689
Accuracy SVM-RBF= 43.994


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.
