In [1]:
import torch
import torchvision
from torchvision import transforms
from PIL import Image
import requests
import time
import numpy as np
import io
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
import torchvision.models as models
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import os
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import random
from tqdm import tqdm
import json
from torch.optim.lr_scheduler import CosineAnnealingLR
import threading
import torchvision.models as models
import torch.nn as nn
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
from nltk.corpus import wordnet
from caption_transforms import SimCLRData_Caption_Transform
from image_transforms import SimCLRData_image_Transform
from dataset import FlickrDataset
from models import ResNetSimCLR,OpenAI_SIMCLR
from utils import get_gpu_stats,layerwise_trainable_parameters,count_trainable_parameters
from metrics import ContrastiveLoss
from metrics import LARS,Optimizer_simclr
from logger import Logger
from train_fns import train, test


In [2]:
# Print the GPU summary via the helper imported from utils, then choose the
# device used by the rest of the notebook: CUDA when available, otherwise CPU.
get_gpu_stats()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

cuda
No of GPUs i have is 2
0
My Graphic Card is Tesla P100-PCIE-16GB
Is Cuda Available True


In [3]:
class Flickr30kDataset(torch.utils.data.Dataset):
    """Dataset pairing each image file with the list of its captions.

    The token file is read once at construction time. Every non-blank line is
    expected to have the form ``<image file name>#<caption number>``, followed
    by a tab character, followed by the caption text.

    Args:
        root_dir: Directory that image file names are joined onto.
        token_file_path: Path of the caption token file.
        transform: Optional callable applied to the loaded RGB PIL image.

    Indexing returns ``(image, captions)`` where ``captions`` is the list of
    caption strings collected for that image, in file order.
    """

    def __init__(self, root_dir, token_file_path, transform=None):
        self.root_dir = root_dir
        self.token_file_path = token_file_path
        self.transform = transform
        self.captions = self._load_captions()
        # Cache the key order once so __getitem__ is O(1) instead of rebuilding
        # the full list of file names on every lookup.
        self._image_filenames = list(self.captions)

    def _load_captions(self):
        """Parse the token file into ``{image file name: [caption, ...]}``.

        Blank lines are skipped. The line is split on the first tab only and
        the key on the last '#' only, so '#' or tab characters inside the
        caption text are preserved rather than truncating the caption.

        Raises:
            ValueError: If a non-blank line lacks the tab separator, lacks the
                '#' separator, or has a non-integer caption number.
        """
        caption_dict = {}
        with open(self.token_file_path) as tokenfile:
            for line_number, raw_line in enumerate(tokenfile, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                image_key, tab, caption_text = line.partition('\t')
                image_file_name, hash_mark, caption_number = image_key.rpartition('#')
                if not tab or not hash_mark:
                    raise ValueError(
                        "Malformed caption line %d in %s: %r"
                        % (line_number, self.token_file_path, line)
                    )
                # The number itself is not stored; converting it only validates
                # the line format (raises ValueError when it is not an integer).
                int(caption_number)
                caption_dict.setdefault(image_file_name, []).append(caption_text)
        return caption_dict

    def __len__(self):
        """Number of distinct images (not the number of captions)."""
        return len(self.captions)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform, return it with its captions."""
        image_filename = self._image_filenames[idx]
        image_path = os.path.join(self.root_dir, image_filename)
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        captions = self.captions[image_filename]
        return image, captions

In [4]:
class OpenAI_SIMCLR_debug(nn.Module):
    """Text encoder: pretrained OpenAI GPT backbone, mean pooling, projection head.

    Args:
        model: Name passed to ``from_pretrained`` for both backbone and tokenizer.
        projection_dim: Output size of the projection head.
        layers_to_train: Backbone parameter-name prefixes that stay trainable;
            every other backbone parameter is frozen.
        encoder_last_layer: If truthy, an extra linear layer maps the pooled
            backbone features to this size before the projection head.
    """

    def __init__(self, model='openai-gpt', projection_dim=128,layers_to_train=('h.11',),encoder_last_layer=None):
        super(OpenAI_SIMCLR_debug, self).__init__()

        # Load backbone and tokenizer
        self.backbone = OpenAIGPTModel.from_pretrained(model)
        self.config = self.backbone.config
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model)
        self.encoder_last_layer = encoder_last_layer

        # Freeze everything except parameters whose name starts with one of the
        # requested prefixes.
        trainable_prefixes = tuple(layers_to_train)
        for name, param in self.backbone.named_parameters():
            param.requires_grad = any(name.startswith(prefix) for prefix in trainable_prefixes)

        if encoder_last_layer:
            self.fc_layer = nn.Linear(self.config.n_embd, encoder_last_layer)
            projection_head_input = encoder_last_layer
        else:
            projection_head_input = self.config.n_embd

        # Projection head
        self.projection_head = nn.Sequential(
            nn.Linear(projection_head_input, projection_head_input),
            nn.ReLU(),
            nn.Linear(projection_head_input, projection_dim)
        )

    def forward(self, texts, device):
        """Encode a batch of strings.

        Args:
            texts: Iterable of strings.
            device: Device the token tensor is moved to before the backbone call.

        Returns:
            ``(features, projections)``: the pooled (and optionally
            ``fc_layer``-mapped) features, and the projection-head output.
        """
        # Tokenize input text
        token_id_lists = [
            self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
            for text in texts
        ]
        # Explicit long dtype: torch.tensor([]) would otherwise be float for a
        # text that tokenizes to nothing.
        tokens_tensor = pad_sequence(
            [torch.tensor(ids, dtype=torch.long) for ids in token_id_lists],
            batch_first=True,
            padding_value=0,
        ).to(device)
        lengths = torch.tensor(
            [len(ids) for ids in token_id_lists], device=tokens_tensor.device
        )

        # Get text features from backbone.
        # assumes the backbone returns one (batch, seq, n_embd) tensor - TODO confirm
        hidden_states = self.backbone(tokens_tensor)

        # Average only over real tokens. Shorter texts are right-padded with id 0,
        # and a plain mean over dim=1 would mix those padding positions into the
        # feature, making it depend on the longest text in the batch.
        positions = torch.arange(tokens_tensor.size(1), device=tokens_tensor.device)
        real_token_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)  # (batch, seq)
        mask = real_token_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids division by zero for a text with no tokens (feature = 0).
        token_counts = lengths.clamp(min=1).unsqueeze(1).to(hidden_states.dtype)
        features = (hidden_states * mask).sum(dim=1) / token_counts  # (batch, n_embd)

        if self.encoder_last_layer:
            features = self.fc_layer(features)
        # Pass text features through projection head
        projections = self.projection_head(features)
        return features, projections

In [5]:
# Image preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each channel.
# NOTE(review): `preprocess` is not passed to the datasets built in this cell,
# which use SimCLRData_image_Transform instead.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Loader configuration shared by the training and validation loaders.
batch_size = 128
NUM_WORKERS = 4
# Pinned host memory only helps when batches are copied to a CUDA device;
# requesting it on a CPU-only machine just produces a warning.
PIN_MEMORY = torch.cuda.is_available()

train_dataset = FlickrDataset('data/', "data/train", 'train',
                              image_transform=SimCLRData_image_Transform(),
                              caption_transform=SimCLRData_Caption_Transform())

test_dataset = FlickrDataset('data/', "data/test", 'test',
                             image_transform=SimCLRData_image_Transform(),
                             caption_transform=SimCLRData_Caption_Transform())

train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=NUM_WORKERS,
                          pin_memory=PIN_MEMORY)

# The validation loader iterates the 'test' split in a fixed order.
val_loader = DataLoader(test_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=NUM_WORKERS,
                        pin_memory=PIN_MEMORY)

In [6]:
def train(dataloader, image_model, text_model, optimizer_image, optimizer_text, criterion,device,
          scheduler_image=None, scheduler_text=None, trade_off_ii=1, trade_off_cc=1,trade_off_ic=1,trade_off_ci=1):
    """Run one training epoch over ``dataloader`` and return the mean batch loss.

    NOTE: this definition shadows the ``train`` imported from ``train_fns`` in
    the notebook's import cell.

    Args:
        dataloader: Iterable of indexable batches; positions 0 and 1 hold the
            two image inputs and positions 3 and 4 the two caption inputs.
            Must support ``len()``.
        image_model, text_model: Called as ``model(inputs, device)``; the second
            element of the returned pair is the embedding fed to ``criterion``.
        optimizer_image, optimizer_text: Optimizers stepped once per batch.
        criterion: Called as ``criterion(x, y, batch_size)``; returns a scalar tensor.
        device: Forwarded unchanged to both models.
        scheduler_image, scheduler_text: Optional schedulers, stepped once at
            the end of the epoch.
        trade_off_ii, trade_off_cc, trade_off_ic, trade_off_ci: Weights of the
            image-image, caption-caption, image-caption and caption-image terms.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    # The mode does not change inside the loop, so set it once.
    image_model.train()
    text_model.train()

    # Clear anything left over from a previous, possibly interrupted, run so
    # stale gradients cannot leak into the first update of this epoch.
    optimizer_image.zero_grad()
    optimizer_text.zero_grad()

    loss_epoch = 0
    for batch in dataloader:
        image1, image2, caption1, caption2 = batch[0], batch[1], batch[3], batch[4]
        batch_size = image1.shape[0]

        _, embed_image1 = image_model(image1, device)
        _, embed_image2 = image_model(image2, device)
        _, embed_caption1 = text_model(caption1, device)
        _, embed_caption2 = text_model(caption2, device)

        contrastive_loss = (trade_off_ii * criterion(embed_image1, embed_image2, batch_size) +
                            trade_off_cc * criterion(embed_caption1, embed_caption2, batch_size) +
                            trade_off_ic * criterion(embed_image1, embed_caption2, batch_size) +
                            trade_off_ci * criterion(embed_caption1, embed_image2, batch_size))

        contrastive_loss.backward()

        optimizer_image.step()
        optimizer_text.step()

        optimizer_image.zero_grad()
        optimizer_text.zero_grad()

        loss_epoch += contrastive_loss.item()

        # Drop references before the next batch is fetched so this batch's
        # tensors can be released early.
        del batch, image1, image2, caption1, caption2
        del embed_image1, embed_image2, embed_caption1, embed_caption2, contrastive_loss

    # Release cached blocks once per epoch; doing it after every batch forces
    # the allocator to re-request memory on each iteration.
    torch.cuda.empty_cache()

    if scheduler_image is not None:
        scheduler_image.step()
    if scheduler_text is not None:
        scheduler_text.step()

    return loss_epoch / len(dataloader)
def test(dataloader, image_model, text_model, criterion, device, trade_off_ii=1, trade_off_cc=1,trade_off_ic=1,trade_off_ci=1):

    loss_epoch = 0

    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            image_model.eval()
            text_model.eval()
            batch_size = batch[0].shape[0]
            image1, image2, caption1, caption2 = batch[0], batch[1], batch[3], batch[4]

            _, embed_image1 = image_model(image1, device)
            _, embed_image2 = image_model(image2, device)
            _, embed_caption1 = text_model(caption1, device)
            _, embed_caption2 = text_model(caption2, device)

            contrastive_loss = (trade_off_ii * criterion(embed_image1, embed_image2, batch_size) +
                      trade_off_cc * criterion(embed_caption1, embed_caption2, batch_size) +
                      trade_off_ic * criterion(embed_image1, embed_caption2, batch_size) +
                     trade_off_ci * criterion(embed_caption1, embed_image2, batch_size) )

            loss_epoch += contrastive_loss.item()

            del batch, image1, image2, caption1, caption2, embed_image1, embed_image2, embed_caption1, embed_caption2, contrastive_loss
            torch.cuda.empty_cache()

    epoch_loss = loss_epoch / len(dataloader)
    return epoch_loss

In [7]:
# ---- Hyperparameters ------------------------------------------------------
total_epochs = 100
projection_dim = 128
#encoder_last_layer=2048

# Optimizer settings (the same optimizer type is used for both encoders).
optimizer_type = 'sgd'
image_learning_rate = 0.03
text_learning_rate = 0.03
momentum = 0.9
weight_decay = 0.0001

# Temperature handed to the contrastive loss.
temperature = 0.07

# Weights of the image-image, caption-caption, image-caption and
# caption-image loss terms.
trade_off_ii = 1
trade_off_cc = 1
trade_off_ic = 1
trade_off_ci = 1

# ---- Models ---------------------------------------------------------------
# Image encoder; only the listed layers are requested as trainable.
model_resnet = ResNetSimCLR(
    model='resnet50',
    projection_dim=projection_dim,
    layers_to_train=['layer3','layer4'],
)
model_resnet = model_resnet.to(device)

# Text encoder; only the listed layers are requested as trainable.
gpt_model = OpenAI_SIMCLR(
    model='openai-gpt',
    projection_dim=projection_dim,
    layers_to_train=['h.10','h.11'],
)
gpt_model = gpt_model.to(device)

# ---- Loss -----------------------------------------------------------------
NXTENT_loss = ContrastiveLoss(device, temperature=temperature)

# ---- Optimizers and schedulers --------------------------------------------
# Optimizer_simclr exposes both an optimizer and a scheduler. The wrapper is
# bound to the optimizer name first, then unpacked (scheduler read first).
optimizer_image = Optimizer_simclr(
    optimizer_name=optimizer_type,
    model_parameters=model_resnet.parameters(),
    lr=image_learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)
scheduler_image, optimizer_image = optimizer_image.scheduler, optimizer_image.optimizer

optimizer_text = Optimizer_simclr(
    optimizer_name=optimizer_type,
    model_parameters=gpt_model.parameters(),
    lr=text_learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)
scheduler_text, optimizer_text = optimizer_text.scheduler, optimizer_text.optimizer

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [8]:
# Each epoch runs one training pass and one validation pass, then reports both
# losses and the wall-clock time. The epoch count comes from `total_epochs`
# (set with the other hyperparameters) instead of a repeated literal.
for epoch in tqdm(range(total_epochs)):
    start = time.time()
    train_loss = train(dataloader=train_loader,
                       image_model=model_resnet,
                       text_model=gpt_model,
                       optimizer_image=optimizer_image,
                       optimizer_text=optimizer_text,
                       criterion=NXTENT_loss,
                       device=device,
                       scheduler_image=scheduler_image,
                       scheduler_text=scheduler_text,
                       trade_off_ii=trade_off_ii,
                       trade_off_cc=trade_off_cc,
                       trade_off_ic=trade_off_ic,
                       trade_off_ci=trade_off_ci)
    test_loss = test(dataloader=val_loader,
                     image_model=model_resnet,
                     text_model=gpt_model,
                     criterion=NXTENT_loss,
                     device=device,
                     trade_off_ii=trade_off_ii,
                     trade_off_cc=trade_off_cc,
                     trade_off_ic=trade_off_ic,
                     trade_off_ci=trade_off_ci)
    end = time.time()
    print('trainloss',round(train_loss,3),'testloss',round(test_loss,3),'time',round(end-start,1))

  0%|          | 0/100 [00:38<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 15.90 GiB total capacity; 5.11 GiB already allocated; 66.75 MiB free; 5.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Display the training loss assigned by the epoch loop above.
# NOTE(review): `train_loss` only exists once a train() call in that loop has
# returned; the recorded run stopped with a CUDA out-of-memory error at 0/100.
train_loss

In [None]:
# Print element `index` of `a` and of `b`.
# NOTE(review): neither `a` nor `b` is defined in any visible cell of this
# notebook, so this presumably relies on leftover kernel state and would raise
# NameError after Restart & Run All - confirm what they were meant to hold.
index=7
print(a[index])
print(b[index])

In [None]:
# NOTE(review): repeats the last print of the previous cell; `b` is not defined
# in any visible cell and `index` comes from the previous cell.
print(b[index])