Introducing manual compression of image captions on stale (offline) data

In [7]:
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, BitsAndBytesConfig, AutoProcessor, LlavaForConditionalGeneration
from transformers import AdamW
from datasets import load_dataset
import torch
from collections import Counter
import fiftyone
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch.nn.functional as F
import numpy as np
import os
import time
import glob

# uncommon features  - events of interest
# loss less compression -  sudden more bits indicates anomaly can be flagged, alerts when anomaly detected - may shift to lossy video streaming
# lossy compression of noisy data varying distortion rate - accuracy is increasing
# video to video lossy reconstruction possibility
# image frame to image frame on a need basis - human satisfaction metric, GPT based comparison, RLHF based comparison

  from .autonotebook import tqdm as notebook_tqdm


Migrating database to v0.23.4


In [29]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
# Load the pre-trained model and its components
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# # Loading the above for LlavVA
# model_llava = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
# processor_llava = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

In [7]:
# Load a dataset (for example, a subset of the COCO dataset)
# TODO: Potential datasets with repetitive nature that can be used: MS COCO, Flickr30k, Visual Genome, SBU Captions - get correlated datasets from Nikil

# load small part of the coco dataset from all the .jpg images in datasets/mscoco/test2015
dataset = load_dataset("datasets/mscoco/test2017/", split="train[:2]")

Resolving data files: 100%|██████████| 40670/40670 [00:00<00:00, 512517.86it/s]


In [8]:
# TODO: use different values of max_length and try out results

def generate_caption_with_logits(image, max_length=10):
    # Prepare the inputs
    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    model.eval()
    with torch.no_grad():
        # Perform a forward pass to get the logits
        encoder_outputs = model.encoder(pixel_values=pixel_values)
        encoder_hidden_states = encoder_outputs.last_hidden_state
        
        # Prepare decoder input_ids. Typically, you start with the start-of-sentence token
        decoder_input_ids = torch.tensor([tokenizer.bos_token_id]).unsqueeze(0).to(encoder_hidden_states.device)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)
        
        # Initialize an empty tensor for logits (for simplicity, accumulating logits for each step)
        logits_list = []
        
        for i in range(max_length):
            decoder_outputs = model.decoder(input_ids=decoder_input_ids,
                                            attention_mask=decoder_attention_mask,
                                            encoder_hidden_states=encoder_hidden_states)
            logits = decoder_outputs.logits[:, -1, :]  # Get the logits for the last token generated
            logits_list.append(logits)
            
            predicted_id = torch.argmax(logits, dim=-1).unsqueeze(-1)
            # Check if EOS token is generated
            if predicted_id[0, 0] == tokenizer.eos_token_id:
                print ("EOS has been generated")
                # break # since model.generate() does this automatically
            
            # Append predicted token ID to decoder_input_ids for generating next token
            decoder_input_ids = torch.cat([decoder_input_ids, predicted_id], dim=-1)
            decoder_attention_mask = torch.cat([decoder_attention_mask, torch.ones_like(predicted_id)], dim=-1)
            
        # Concatenate logits from each step to get the final logits tensor
        # make all elements of logits_list 3D by adding a dimension in the middle
        logits_list = [logits.unsqueeze(1) for logits in logits_list]
        logits = torch.cat(logits_list, dim=1)
        # add logic to repeat the remaining number of (127-i) tokens with EOS token logits (simply repeat the last token logits) to make it length 128

        # Decode the generated token IDs to get the caption
        predicted_ids = torch.argmax(logits, dim=-1)
        caption = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        
    return logits, predicted_ids, caption

# Example usage
# image: A PIL image or a tensor representing your input image
# logits, predicted_ids, caption = generate_caption_with_logits(image, model, feature_extractor, tokenizer)


In [9]:
# Iterate over the dataset and generate captions
generated_captions = []
generated_logits = []
generated_predicted_ids = []

for data in dataset:
    image = data['image']
    logits, predicted_ids, caption = generate_caption_with_logits(image)
    generated_captions.append(caption)
    generated_logits.append(logits)
    generated_predicted_ids.append(predicted_ids)

# concatenate generated logits along first dimension to make 3D tensor
generated_logits = torch.cat(generated_logits, dim=0)
print (generated_logits.shape)

EOS has been generated
torch.Size([2, 10, 50257])


In [10]:
generated_captions

['a green truck parked next to a curb ',
 'a baseball player swinging a bat at a ball ']

In [11]:
def update_encoding_dict(captions, encoding_dict):
    for caption in captions:
        words = caption.split() # splitting the caption into words - pretty bad strategy since we are currently splitting into tokens
        encoding_dict.update(words) # purpose of update is to add the words to the dictionary if they don't exist
    return encoding_dict

In [12]:
encoding_dict = Counter() # Counter is a subclass of dictionary for counting hashable objects
threshold = 0 # threshold for word frequency # TODO: find a good threshold

update_encoding_dict(generated_captions, encoding_dict)

print (encoding_dict)

# Optionally, create a more compressed form based on frequency
compressed_dict = {word: idx for idx, (word, freq) in enumerate(encoding_dict.items()) if freq > threshold}

# Create the dictionary of entropy values from encoding_dict
entropy_dict = {word: -np.log(encoding_dict[word] / sum(encoding_dict.values())) 
                for word in encoding_dict}

print (entropy_dict)
# print 1/elem for elem in encoding_dict.values()
reciprocal_dict = {word: 1/(encoding_dict[word]+1) for word in encoding_dict}
print (reciprocal_dict)

Counter({'a': 5, 'green': 1, 'truck': 1, 'parked': 1, 'next': 1, 'to': 1, 'curb': 1, 'baseball': 1, 'player': 1, 'swinging': 1, 'bat': 1, 'at': 1, 'ball': 1})
{'a': 1.2237754316221157, 'green': 2.833213344056216, 'truck': 2.833213344056216, 'parked': 2.833213344056216, 'next': 2.833213344056216, 'to': 2.833213344056216, 'curb': 2.833213344056216, 'baseball': 2.833213344056216, 'player': 2.833213344056216, 'swinging': 2.833213344056216, 'bat': 2.833213344056216, 'at': 2.833213344056216, 'ball': 2.833213344056216}
{'a': 0.16666666666666666, 'green': 0.5, 'truck': 0.5, 'parked': 0.5, 'next': 0.5, 'to': 0.5, 'curb': 0.5, 'baseball': 0.5, 'player': 0.5, 'swinging': 0.5, 'bat': 0.5, 'at': 0.5, 'ball': 0.5}


In [13]:
class CaptionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, captions):
        self.encodings = encodings
        self.captions = captions

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.captions[idx])
        return item

    def __len__(self):
        return len(self.captions)

In [23]:
# Assuming `dataset` is your dataset containing images and captions
images = [data['image'] for data in dataset]
caption_ids = generated_predicted_ids

# Process images and captions
inputs = feature_extractor(images=images, return_tensors="pt")

# Create dataset and dataloader
train_dataset = CaptionDataset(inputs, caption_ids)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [27]:
class LoRALayer(nn.Module):
    def __init__(self, original_weight, rank):
        super(LoRALayer, self).__init__()
        self.original_weight = original_weight
        self.rank = rank
        self.U = nn.Parameter(torch.Tensor(self.original_weight.size(0), self.rank))
        self.V = nn.Parameter(torch.Tensor(self.rank, self.original_weight.size(1)))
        nn.init.xavier_uniform_(self.U)
        nn.init.xavier_uniform_(self.V)

    def forward(self):
        return self.original_weight + self.U @ self.V

In [28]:
# Modify the first attention layer of the encoder
# TODO: Try modifying other layers as well and check the results
lora_layers = []

with torch.no_grad():
    original_weight = model.encoder.encoder.layer[0].attention.output.dense.weight
    lora_layer = LoRALayer(original_weight, rank=10).forward()  # Choose an appropriate rank
    # assign the new layer to the model
    model.encoder.encoder.layer[0].attention.output.dense.weight.copy_(lora_layer)
    # add the layer of the model to the list of LoRA layers
    lora_layers.append(model.encoder.encoder.layer[0].attention.output.dense)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [24]:
def is_lora_param(param, lora_layer):
    # check if the parameter is part of the LoRA layer
    print (lora_layer.parameters())
    print ("nuj")
    print (param)
    return param in lora_layer.parameters()


In [25]:
# add two extra dimensions to generated_logits
generated_probs = F.softmax(generated_logits, dim=-1)
generated_probs_expanded = generated_probs.unsqueeze(0).unsqueeze(0)

In [26]:
def calculate_entropy_elbo_difference (prob_differences, D):
    sigma = 0.01
    # reduce prob_differences to 4D from 5D by taking norm square along the last dimension
    prob_differences = torch.norm(prob_differences, dim=-1)
    print (prob_differences.shape)
    # do elementwise for prob_differences: suqare
    prob_differences = prob_differences**2
    # take sum of all elements of prob_differences, hence scalar, then divide by 2*sigma^2*D
    return torch.sum(prob_differences) / (2*sigma**2*D)

In [19]:
def calculate_entropy_elbo_cross_entropy (prob_differences, D):
    pass

In [20]:
def calculate_entropy (prob_differences, D):
    pass

In [21]:
def custom_loss(outputs, batch, encoding_dict, lora_layers, lambda_val=10, lora_lambda_val = 0.01):
    # Standard captioning loss
    standard_loss = outputs.loss

    # Additional compression loss
    compression_loss = 0
    # add two dimensions to output probs at 2 and 3
    outputs_probs = F.softmax(outputs.logits, dim=-1)
    outputs_probs_expanded = outputs_probs.squeeze(1).unsqueeze(2).unsqueeze(3)
    prob_differences = generated_probs_expanded - outputs_probs_expanded
    print ("prob_differences.shape = ", outputs_probs.shape, generated_probs_expanded.shape, outputs_probs_expanded.shape, prob_differences.shape)
    # calculate the compression loss
    # find number of elements in generated_predicted_logits
    D = generated_probs.numel()
    compression_loss = lambda_val* calculate_entropy_elbo_difference (prob_differences, D)
    

    # Optionally, add a term for LoRA regularization if needed
    lora_regularization = 0
    # for param in model.parameters():
    #     for lora_layer in lora_layers:
    #         if is_lora_param(param, lora_layer):
    #             lora_regularization += torch.norm(param)
    print (standard_loss, compression_loss)

    return standard_loss + compression_loss + lora_lambda_val * lora_regularization

In [22]:
# Fine tuning using custom loss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

lr = 1e-4
num_epochs = 30

optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=lr)

for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = custom_loss(outputs, batch, encoding_dict, lora_layers)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.captions[idx])
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  0%|          | 0/1 [00:00<?, ?it/s]


NameError: name 'lora_layers' is not defined

In [None]:
# create directory to save the model if it doesn't exist
if not os.path.exists("models"):
    os.mkdir("models")
# save model checkpoint to models directory using current timestamp and date
torch.save(model.state_dict(), f"models/{time.strftime('%Y%m%d-%H%M%S')}.pth")


In [None]:
# load latest model checkpoint among all the saved models
latest_model = torch.load(max(glob.glob('models/*.pth'), key=os.path.getctime))
# load the model with the latest checkpoint
model.load_state_dict(latest_model)

<All keys matched successfully>

In [None]:
# Generate captions for the test dataset
generated_captions_custom_model = []
# Iterate over the dataset and generate captions
for data in dataset:
    image = data['image']
    generated_logits, generated_predicted_ids, caption = generate_caption_with_logits(image)
    generated_captions_custom_model.append(caption)


EOS has been generated
EOS has been generated
EOS has been generated


In [None]:
# Encode compressed dictionary word using manual huffman encoding

In [None]:
# Replace compressed_dict words occurring in the generated_captions_custom_model with their corresponding huffman encoding

In [None]:
# compare encoded generated_captions_custom_model + huffman encoding dictionary information with the original generated_captions to calculate compression ratio

In [None]:
# print generated_captions and generated_captions_custom_model elementwise to compare the results
for i in range(len(generated_captions)):
    print (generated_captions[i], "WAIT", generated_captions_custom_model[i])

a green truck parked next to a curb  WAIT a green truck parked next to a curb next to
a man is walking down the street with a skate WAIT a man walking down the street with a skateboard
a baseball player swinging a bat at a ball  WAIT a man swinging at a bat at a ball 
a cow is standing in a field of grass  WAIT a cow standing in grass field  
a black dog sitting in the back of a truck WAIT a black black dog sitting in the back of a
a man wearing a bow tie and glasses  WAIT a man wearing glasses and bow tie and glasses 
a dining room table with a large bowl of food WAIT a room with a large table with a large bowl
a man standing next to a wall with a bunch WAIT a man standing next to a bunch of bunch of
a man is playing tennis on a clay court  WAIT a man is playing tennis on a court 
a man and a woman playing a game of fr WAIT a man playing a game of fr fr fr game
