Introducing manual compression of image captions on stale (offline) data

In [2]:
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, BitsAndBytesConfig, AutoProcessor, LlavaForConditionalGeneration
from transformers import AdamW
from datasets import load_dataset
import torch
from collections import Counter
import fiftyone
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch.nn.functional as F
import numpy as np
import os
import time
import glob

# uncommon features  - events of interest
# loss less compression -  sudden more bits indicates anomaly can be flagged, alerts when anomaly detected - may shift to lossy video streaming
# lossy compression of noisy data varying distortion rate - accuracy is increasing
# video to video lossy reconstruction possibility
# image frame to image frame on a need basis - human satisfaction metric, GPT based comparison, RLHF based comparison

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Check if GPU is available and set the device accordingly
if torch.cuda.is_available():
    print("Using the GPU!")
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    device = torch.device("cuda")

In [5]:
# Load the pre-trained model and its components
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# # Loading the above for LlavVA
# model_llava = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
# processor_llava = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



In [35]:
# Load a dataset (for example, a subset of the COCO dataset)
# TODO: Potential datasets with repetitive nature that can be used: MS COCO, Flickr30k, Visual Genome, SBU Captions - get correlated datasets from Nikil

# load small part of the coco dataset from all the .jpg images in datasets/mscoco/test2015
dataset = load_dataset("datasets/mscoco/test2015/", split="test[:100]")

Resolving data files: 100%|██████████| 81434/81434 [00:00<00:00, 1140773.36it/s]


In [39]:
# TODO: use different values of max_length and try out results

def generate_caption_with_logits(image, max_length=15):
    # Prepare the inputs
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs.pixel_values
    
    model.to(device)
    model.eval()
    
    with torch.no_grad():
        # Perform a forward pass to get the logits
        encoder_outputs = model.encoder(pixel_values=pixel_values)
        encoder_hidden_states = encoder_outputs.last_hidden_state
        
        # Prepare decoder input_ids. Typically, you start with the start-of-sentence token
        decoder_input_ids = torch.tensor([tokenizer.bos_token_id]).unsqueeze(0).to(encoder_hidden_states.device)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)
        
        # Initialize an empty tensor for logits (for simplicity, accumulating logits for each step)
        logits_list = []
        
        for i in range(max_length):
            decoder_outputs = model.decoder(input_ids=decoder_input_ids,
                                            attention_mask=decoder_attention_mask,
                                            encoder_hidden_states=encoder_hidden_states)
            logits = decoder_outputs.logits[:, -1, :]  # Get the logits for the last token generated
            logits_list.append(logits)
            
            predicted_id = torch.argmax(logits, dim=-1).unsqueeze(-1)
            # Check if EOS token is generated
            if predicted_id[0, 0] == tokenizer.eos_token_id:
                print ("EOS has been generated")
                # break # since model.generate() does this automatically
            
            # Append predicted token ID to decoder_input_ids for generating next token
            decoder_input_ids = torch.cat([decoder_input_ids, predicted_id], dim=-1)
            decoder_attention_mask = torch.cat([decoder_attention_mask, torch.ones_like(predicted_id).to(device)], dim=-1)
            
        # Concatenate logits from each step to get the final logits tensor
        # make all elements of logits_list 3D by adding a dimension in the middle
        logits_list = [logits.unsqueeze(1) for logits in logits_list]
        logits = torch.cat(logits_list, dim=1)
        # add logic to repeat the remaining number of (127-i) tokens with EOS token logits (simply repeat the last token logits) to make it length 128

        # Decode the generated token IDs to get the caption
        predicted_ids = torch.argmax(logits, dim=-1)
        caption = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        
    return logits, predicted_ids, caption

# Example usage
# image: A PIL image or a tensor representing your input image
# logits, predicted_ids, caption = generate_caption_with_logits(image, model, feature_extractor, tokenizer)


In [40]:
# Iterate over the dataset and generate captions
generated_captions = []
generated_logits = []
generated_predicted_ids = []

for data in dataset:
    image = data['image']
    logits, predicted_ids, caption = generate_caption_with_logits(image)
    generated_captions.append(caption)
    generated_logits.append(logits)
    generated_predicted_ids.append(predicted_ids)

# concatenate generated logits along first dimension to make 3D tensor
generated_logits = torch.cat(generated_logits, dim=0)
print (generated_logits.shape)

EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
torch.Size([100, 15, 50257])


In [41]:
generated_captions

['a green truck next to a curb next to a curb    a',
 'a man walking down the street with a a a a a a a a',
 'a man with a bat on a man with a man with a man with',
 'cows standing in a green grass green grass green grass green grass green grass',
 'a green truck parked next to a curb next to a curb   ',
 'man with a man with a man with a man with a man with a',
 'a table with a wooden table with a a a a a a a a',
 'a man is standing next to a a a a a a a a a',
 'a man with a tennis racket on a tennis court  a man with',
 'men walking in a grassy area with a man with a skateboard ',
 'children standing next to a a a a a a a a a a a',
 'people standing in a a a a a a a a a a a a',
 'z to a a a a a a a a a a a a a',
 'a white dog with a green and white dog  a a a a',
 'h green green grass with a green green grass with a green grass ',
 'birds on a green green green next to a green green green  a',
 'a green green green green green green green green green green green green green green',
 

In [42]:
def update_encoding_dict(captions, encoding_dict):
    for caption in captions:
        words = caption.split() # splitting the caption into words - pretty bad strategy since we are currently splitting into tokens
        encoding_dict.update(words) # purpose of update is to add the words to the dictionary if they don't exist
    return encoding_dict

In [43]:
encoding_dict = Counter() # Counter is a subclass of dictionary for counting hashable objects
threshold = 0 # threshold for word frequency # TODO: find a good threshold

update_encoding_dict(generated_captions, encoding_dict)

print (encoding_dict)

# Optionally, create a more compressed form based on frequency
compressed_dict = {word: idx for idx, (word, freq) in enumerate(encoding_dict.items()) if freq > threshold}

# Create the dictionary of entropy values from encoding_dict
entropy_dict = {word: -np.log(encoding_dict[word] / sum(encoding_dict.values())) 
                for word in encoding_dict}

print (entropy_dict)
# print 1/elem for elem in encoding_dict.values()
reciprocal_dict = {word: 1/(encoding_dict[word]+1) for word in encoding_dict}
print (reciprocal_dict)

Counter({'a': 905, 'with': 101, 'man': 89, 'green': 74, 'street': 33, 'walking': 29, 'to': 23, 'down': 21, 'next': 19, 'is': 19, 'the': 11, 'in': 11, 'grass': 11, 'curb': 9, 'para': 8, 'on': 7, 'standing': 7, 'people': 7, 'men': 6, 'truck': 5, 'tennis': 5, 'skateboard': 5, 'racket': 4, 'white': 3, 'cows': 2, 'parked': 2, 'table': 2, 'dog': 2, 'and': 2, 'an': 2, 'two': 2, 'beach': 2, 'ze': 2, 'tree': 2, 'bat': 1, 'wooden': 1, 'court': 1, 'grassy': 1, 'area': 1, 'children': 1, 'z': 1, 'h': 1, 'birds': 1, 'stuffed': 1, 'animals': 1, 'plate': 1, 'g': 1, 'surf': 1, 'sk': 1, 'k': 1, 'flying': 1, 'sitting': 1, 'striped': 1, 'ele': 1})
{'a': 0.47276225168481517, 'green': 2.9766321021805715, 'truck': 5.671259282950641, 'next': 4.336258216218301, 'to': 4.145202979455592, 'curb': 5.083472618048522, 'man': 2.7920608256526016, 'walking': 3.9134013653982676, 'down': 4.236174757661319, 'the': 4.882801922586371, 'street': 3.784189633918261, 'with': 2.665576678543482, 'bat': 7.280697195384741, 'on': 5.

In [44]:
class CaptionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, captions):
        self.encodings = encodings
        self.captions = captions

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.captions[idx])
        return item

    def __len__(self):
        return len(self.captions)

In [45]:
# Assuming `dataset` is your dataset containing images and captions
images = [data['image'] for data in dataset]
caption_ids = generated_predicted_ids

# Process images and captions
inputs = feature_extractor(images=images, return_tensors="pt") 

# Create dataset and dataloader
train_dataset = CaptionDataset(inputs, caption_ids)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [46]:
class LoRALayer(nn.Module):
    def __init__(self, original_weight, rank):
        super(LoRALayer, self).__init__()
        self.original_weight = original_weight
        self.rank = rank
        self.U = nn.Parameter(torch.Tensor(self.original_weight.size(0), self.rank))
        self.V = nn.Parameter(torch.Tensor(self.rank, self.original_weight.size(1)))
        nn.init.xavier_uniform_(self.U)
        nn.init.xavier_uniform_(self.V)

    def forward(self):
        return self.original_weight + self.U @ self.V

In [47]:
# Modify the first attention layer of the encoder
# TODO: Try modifying other layers as well and check the results
lora_layers = []

with torch.no_grad():
    original_weight = model.encoder.encoder.layer[0].attention.output.dense.weight
    lora_layer = LoRALayer(original_weight, rank=10).forward()  # Choose an appropriate rank
    # assign the new layer to the model
    model.encoder.encoder.layer[0].attention.output.dense.weight.copy_(lora_layer)
    # add the layer of the model to the list of LoRA layers
    lora_layers.append(model.encoder.encoder.layer[0].attention.output.dense)

In [48]:
def is_lora_param(param, lora_layer):
    # check if the parameter is part of the LoRA layer
    print (lora_layer.parameters())
    print ("nuj")
    print (param)
    return param in lora_layer.parameters()


In [49]:
# add two extra dimensions to generated_logits
generated_probs = F.softmax(generated_logits, dim=-1)
generated_probs_expanded = generated_probs.unsqueeze(0).unsqueeze(0).to(device)

In [50]:
def calculate_entropy_elbo_difference (prob_differences, D):
    sigma = 0.01
    # reduce prob_differences to 4D from 5D by taking norm square along the last dimension
    prob_differences = prob_differences.to(device)
    prob_differences = torch.norm(prob_differences, dim=-1)
    print (prob_differences.shape)
    # do elementwise for prob_differences: suqare
    prob_differences = prob_differences**2
    # take sum of all elements of prob_differences, hence scalar, then divide by 2*sigma^2*D
    return torch.sum(prob_differences) / (2*sigma**2*D)

In [51]:
def calculate_entropy_elbo_cross_entropy (prob_differences, D):
    pass

In [52]:
def calculate_entropy (prob_differences, D):
    pass

In [53]:
def custom_loss(outputs, batch, encoding_dict, lora_layers, lambda_val=10, lora_lambda_val = 0.01):
    # Sending all to GPU
    outputs.logits = outputs.logits.to(device)
    outputs.loss = outputs.loss.to(device)
    
    # Standard captioning loss
    standard_loss = outputs.loss

    # Additional compression loss
    compression_loss = 0
    # add two dimensions to output probs at 2 and 3
    outputs_probs = F.softmax(outputs.logits, dim=-1)
    outputs_probs_expanded = outputs_probs.squeeze(1).unsqueeze(2).unsqueeze(3)
    prob_differences = generated_probs_expanded - outputs_probs_expanded
    print ("prob_differences.shape = ", outputs_probs.shape, generated_probs_expanded.shape, outputs_probs_expanded.shape, prob_differences.shape)
    # calculate the compression loss
    # find number of elements in generated_predicted_logits
    D = generated_probs.numel()
    compression_loss = lambda_val* calculate_entropy_elbo_difference (prob_differences, D)
    

    # Optionally, add a term for LoRA regularization if needed
    lora_regularization = 0
    # for param in model.parameters():
    #     for lora_layer in lora_layers:
    #         if is_lora_param(param, lora_layer):
    #             lora_regularization += torch.norm(param)
    print (standard_loss, compression_loss)

    return standard_loss + compression_loss + lora_lambda_val * lora_regularization

In [54]:
# Fine tuning using custom loss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

lr = 1e-4
num_epochs = 30

optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=lr)

for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = custom_loss(outputs, batch, encoding_dict, lora_layers)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.captions[idx])


: 

In [31]:
# create directory to save the model if it doesn't exist
if not os.path.exists("models"):
    os.mkdir("models")
# save model checkpoint to models directory using current timestamp and date
torch.save(model.state_dict(), f"models/{time.strftime('%Y%m%d-%H%M%S')}.pth")


In [32]:
# load latest model checkpoint among all the saved models
latest_model = torch.load(max(glob.glob('models/*.pth'), key=os.path.getctime))
# load the model with the latest checkpoint
model.load_state_dict(latest_model)
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [33]:
# Generate captions for the test dataset
generated_captions_custom_model = []
# Iterate over the dataset and generate captions
for data in dataset:
    image = data['image']
    generated_logits, generated_predicted_ids, caption = generate_caption_with_logits(image)
    generated_captions_custom_model.append(caption)


In [None]:
# Encode compressed dictionary word using manual huffman encoding

In [None]:
# Replace compressed_dict words occurring in the generated_captions_custom_model with their corresponding huffman encoding

In [None]:
# compare encoded generated_captions_custom_model + huffman encoding dictionary information with the original generated_captions to calculate compression ratio

In [34]:
# print generated_captions and generated_captions_custom_model elementwise to compare the results
for i in range(len(generated_captions)):
    print (generated_captions[i], "WAIT", generated_captions_custom_model[i])

a green truck parked next to a curb  WAIT a green truck next to a curb next to a
a man is walking down the street with a skate WAIT a man walking down the street with a a a
