Introducing manual compression of image captions on stale (offline) data

In [1]:
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, BitsAndBytesConfig, AutoProcessor, LlavaForConditionalGeneration
from transformers import AdamW
from datasets import load_dataset
import torch
from torch.cuda.amp import autocast, GradScaler
from collections import Counter
import fiftyone
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch.nn.functional as F
import numpy as np
import os
import time
import glob

# uncommon features  - events of interest
# loss less compression -  sudden more bits indicates anomaly can be flagged, alerts when anomaly detected - may shift to lossy video streaming
# lossy compression of noisy data varying distortion rate - accuracy is increasing
# video to video lossy reconstruction possibility
# image frame to image frame on a need basis - human satisfaction metric, GPT based comparison, RLHF based comparison

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the pre-trained model and its components
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# # Loading the above for LlavVA
# model_llava = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
# processor_llava = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



In [5]:
# Load a dataset (for example, a subset of the COCO dataset)
# TODO: Potential datasets with repetitive nature that can be used: MS COCO, Flickr30k, Visual Genome, SBU Captions - get correlated datasets from Nikil

# load small part of the coco dataset from all the .jpg images in datasets/mscoco/test2015
dataset_dict = load_dataset("datasets/mscoco/test2015/", split="test[:100]")
dataset_finetune = load_dataset("datasets/mscoco/test2015/", split="test[:200]")
# TODO: Determine if having same images for dictionary and fine tuning helps, or overlap or completely different images help

Resolving data files: 100%|██████████| 81434/81434 [00:00<00:00, 1036333.54it/s]
Resolving data files: 100%|██████████| 81434/81434 [00:00<00:00, 1101813.73it/s]


In [8]:
# TODO: use different values of max_length and try out results

def generate_caption_with_logits(image, max_length=15):
    # define device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Prepare the inputs
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs.pixel_values
    
    model.to(device)
    model.eval()
    
    with torch.no_grad():
        # Perform a forward pass to get the logits
        encoder_outputs = model.encoder(pixel_values=pixel_values)
        encoder_hidden_states = encoder_outputs.last_hidden_state
        
        # Prepare decoder input_ids. Typically, you start with the start-of-sentence token
        decoder_input_ids = torch.tensor([tokenizer.bos_token_id]).unsqueeze(0).to(encoder_hidden_states.device)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)
        
        # Initialize an empty tensor for logits (for simplicity, accumulating logits for each step)
        logits_list = []
        
        for i in range(max_length):
            decoder_outputs = model.decoder(input_ids=decoder_input_ids,
                                            attention_mask=decoder_attention_mask,
                                            encoder_hidden_states=encoder_hidden_states)
            logits = decoder_outputs.logits[:, -1, :]  # Get the logits for the last token generated
            logits_list.append(logits)
            
            predicted_id = torch.argmax(logits, dim=-1).unsqueeze(-1)
            # Check if EOS token is generated
            if predicted_id[0, 0] == tokenizer.eos_token_id:
                print ("EOS has been generated")
                # break # since model.generate() does this automatically
            
            # Append predicted token ID to decoder_input_ids for generating next token
            decoder_input_ids = torch.cat([decoder_input_ids, predicted_id], dim=-1)
            decoder_attention_mask = torch.cat([decoder_attention_mask, torch.ones_like(predicted_id).to(device)], dim=-1)
            
        # Concatenate logits from each step to get the final logits tensor
        # make all elements of logits_list 3D by adding a dimension in the middle
        logits_list = [logits.unsqueeze(1) for logits in logits_list]
        logits = torch.cat(logits_list, dim=1)
        # add logic to repeat the remaining number of (127-i) tokens with EOS token logits (simply repeat the last token logits) to make it length 128

        # Decode the generated token IDs to get the caption
        predicted_ids = torch.argmax(logits, dim=-1)
        caption = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        
    return logits, predicted_ids, caption

# Example usage
# image: A PIL image or a tensor representing your input image
# logits, predicted_ids, caption = generate_caption_with_logits(image, model, feature_extractor, tokenizer)


In [9]:
def generate_captions_logits_ids_from_dataset(dataset_dict):
    # Iterate over the dataset and generate captions
    generated_captions = []
    generated_logits = []
    generated_predicted_ids = []

    for data in dataset_dict:
        image = data['image']
        logits, predicted_ids, caption = generate_caption_with_logits(image)
        generated_captions.append(caption)
        generated_logits.append(logits)
        generated_predicted_ids.append(predicted_ids)

    # concatenate generated logits along first dimension to make 3D tensor
    generated_logits = torch.cat(generated_logits, dim=0)
    print (generated_logits.shape)

    # concatenate generated predicted_ids along first dimension to make 2D tensor
    generated_predicted_ids = torch.cat(generated_predicted_ids, dim=0)

    # form new tensor of size unique tokens * vocab size : each row is the means of logits of all tokens with that id\n",
    # find unique elements in generated_predicted_ids, result should be a 1D tensor\n",
    unique_tokens = torch.unique(generated_predicted_ids)
    # store torch tensor of the number of times each unique token appears in generated_predicted_ids -  remember generated_predicted_ids is a 2D tensor
    unique_token_counts = torch.zeros(unique_tokens.shape[0]).to(device)
    for i, token in enumerate(unique_tokens):
        unique_token_counts[i] = torch.sum(generated_predicted_ids == token)
    unique_token_probs = unique_token_counts / torch.sum(unique_token_counts)
    print (unique_token_counts.shape)
    # find the mean of logits for each unique token
    mean_logits = torch.zeros(unique_tokens.shape[0], generated_logits.shape[2]).to(device)
    for i, token in enumerate(unique_tokens):
        mean_logits[i] = torch.mean(generated_logits[generated_predicted_ids == token], dim=0)
    print (mean_logits.shape)
    return generated_captions, generated_logits, generated_predicted_ids, unique_tokens, unique_token_probs, unique_token_counts, mean_logits

EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has bee

In [10]:
generated_captions_dict, generated_logits_dict, generated_predicted_ids_dict, unique_tokens_dict, unique_token_probs_dict, unique_token_counts_dict, mean_logits_dict = generate_captions_logits_ids_from_dataset(dataset_dict)
generated_captions_finetune, generated_logits_finetune, generated_predicted_ids_finetune, unique_tokens_finetune, unique_token_probs_finetune, unique_token_counts_finetune, mean_logits_finetune = generate_captions_logits_ids_from_dataset(dataset_finetune)

['a green truck parked next to a curb a green truck parked next',
 'a man is walking down the street with a skateboard a man',
 'a baseball player swinging a bat at a ball a baseball player swinging',
 'a cow is standing in a field of grass a cow is standing',
 'a black dog sitting in the back of a truck a black dog',
 'a man wearing a bow tie and glasses a man wearing a bow',
 'a dining room table with a large bowl of food a large kitchen',
 'a man standing next to a wall with a bunch of guitars a',
 'a man is playing tennis on a clay court a man is playing',
 'a man and a woman playing a game of frisbee a',
 'a woman and a man are drinking wine people are sitting at a',
 'people are standing around a table people are drinking wine people',
 'a zebra standing in a fenced in area a zebra',
 'two dogs are looking at each other in a room a dog is',
 'a horse grazing in a field with a tree a horse grazing in',
 'a bird perched on top of a bird feeder a bird sitting',
 'a train on a track 

In [11]:
def update_encoding_dict(captions, encoding_dict):
    for caption in captions:
        words = caption.split() # splitting the caption into words - pretty bad strategy since we are currently splitting into tokens
        encoding_dict.update(words) # purpose of update is to add the words to the dictionary if they don't exist
    return encoding_dict

In [12]:
encoding_dict = Counter() # Counter is a subclass of dictionary for counting hashable objects
threshold = 0 # threshold for word frequency # TODO: find a good threshold

update_encoding_dict(generated_captions, encoding_dict)

print (encoding_dict)

# Optionally, create a more compressed form based on frequency
compressed_dict = {word: idx for idx, (word, freq) in enumerate(encoding_dict.items()) if freq > threshold}

# Create the dictionary of entropy values from encoding_dict
entropy_dict = {word: -np.log(encoding_dict[word] / sum(encoding_dict.values())) 
                for word in encoding_dict}

print (entropy_dict)
# print 1/elem for elem in encoding_dict.values()
reciprocal_dict = {word: 1/(encoding_dict[word]+1) for word in encoding_dict}
print (reciprocal_dict)

Counter({'a': 329, 'on': 49, 'of': 42, 'man': 41, 'with': 40, 'in': 36, 'is': 25, 'sitting': 24, 'standing': 18, 'and': 18, 'people': 18, 'woman': 17, 'are': 14, 'at': 12, 'large': 11, 'food': 10, 'tennis': 10, 'the': 9, 'playing': 9, 'street': 8, 'top': 8, 'truck': 7, 'parked': 7, 'next': 7, 'to': 7, 'black': 7, 'holding': 7, 'riding': 7, 'field': 6, 'room': 6, 'table': 6, 'two': 6, 'plate': 6, 'person': 6, 'bear': 6, 'fire': 6, 'cat': 6, 'walking': 5, 'down': 5, 'dog': 5, 'wearing': 5, 'tie': 5, 'tree': 5, 'train': 5, 'laptop': 5, 'clock': 5, 'laying': 5, 'court': 4, 'zebra': 4, 'an': 4, 'it': 4, 'through': 4, 'jetliner': 4, 'building': 4, 'dress': 4, 'skateboard': 3, 'swinging': 3, 'ball': 3, 'bowl': 3, 'area': 3, 'looking': 3, 'horse': 3, 'grazing': 3, 'bird': 3, 'elephant': 3, 'teddy': 3, 'couch': 3, 'motorcycle': 3, 'side': 3, 'road': 3, 'bed': 3, 'herd': 3, 'flying': 3, 'sky': 3, "it's": 3, 'tower': 3, 'light': 3, 'computer': 3, 'mouse': 3, 'filled': 3, 'green': 2, 'baseball': 2

In [14]:
class CaptionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, captions):
        self.encodings = encodings
        self.captions = captions

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.captions[idx])
        return item

    def __len__(self):
        return len(self.captions)

In [15]:
# Assuming `dataset` is your dataset containing images and captions
images = [data['image'] for data in dataset_finetune]
caption_ids = generated_predicted_ids_finetune

# Process images and captions
inputs = feature_extractor(images=images, return_tensors="pt") 

# Create dataset and dataloader
train_dataset = CaptionDataset(inputs, caption_ids)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [16]:
class LoRALayer(nn.Module):
    def __init__(self, original_weight, rank):
        super(LoRALayer, self).__init__()
        self.original_weight = original_weight
        self.rank = rank
        self.device = original_weight.device
        self.U = nn.Parameter(torch.Tensor(self.original_weight.size(0), self.rank)).to(self.device)
        self.V = nn.Parameter(torch.Tensor(self.rank, self.original_weight.size(1))).to(self.device)
        nn.init.xavier_uniform_(self.U)
        nn.init.xavier_uniform_(self.V)

    def forward(self):
        return self.original_weight + self.U @ self.V

In [17]:
# Modify the first attention layer of the encoder
# TODO: Try modifying other layers as well and check the results
lora_layers = []

with torch.no_grad():
    original_weight = model.encoder.encoder.layer[0].attention.output.dense.weight
    lora_layer = LoRALayer(original_weight, rank=10).to(device).forward()  # Choose an appropriate rank
    # assign the new layer to the model
    model.encoder.encoder.layer[0].attention.output.dense.weight.copy_(lora_layer)
    # add the layer of the model to the list of LoRA layers
    lora_layers.append(model.encoder.encoder.layer[0].attention.output.dense)

In [18]:
def is_lora_param(param, lora_layer):
    # check if the parameter is part of the LoRA layer
    print (lora_layer.parameters())
    print ("nuj")
    print (param)
    return param in lora_layer.parameters()


In [19]:
# add two extra dimensions to generated_logits
generated_probs_dict = F.softmax(mean_logits_dict, dim=-1)
generated_probs_dict_expanded = generated_probs_dict.unsqueeze(0).unsqueeze(0).to(device)

In [20]:
def calculate_entropy_elbo_difference (prob_differences, unique_token_counts, D):
    sigma = 0.01
    # reduce prob_differences to 4D from 5D by taking norm square along the last dimension
    prob_differences = prob_differences.to(device)
    prob_differences = torch.norm(prob_differences, dim=-1)
    print (prob_differences.shape)
    # do elementwise for prob_differences: suqare
    prob_differences = prob_differences**2
    # multiply i,j,k th element of prob_differences with k th element of unique_token_probs
    prob_differences = prob_differences * unique_token_counts
    # take sum of all elements of prob_differences, hence scalar, then divide by 2*sigma^2*D
    return torch.sum(prob_differences) / (2*sigma**2*D)

In [21]:
def calculate_entropy_elbo_cross_entropy (prob_differences, D):
    pass

In [22]:
def calculate_entropy (prob_differences, D):
    pass

In [23]:
def custom_loss(outputs, batch, encoding_dict, lora_layers, lambda_val=0.05, lora_lambda_val = 0.01):
    # Sending all to GPU
    outputs.logits = outputs.logits.to(device)
    outputs.loss = outputs.loss.to(device)
    
    # Standard captioning loss
    standard_loss = outputs.loss

    # Additional compression loss
    compression_loss = 0
    # add two dimensions to output probs at 2 and 3
    outputs_probs = F.softmax(outputs.logits, dim=-1)
    outputs_probs_expanded = outputs_probs.squeeze(1).unsqueeze(2).to(device)
    # print shapes of generated_probs_expanded and outputs_probs_expanded
    print ("generated_probs_expanded.shape = ", generated_probs_dict_expanded.shape, outputs_probs_expanded.shape)
    prob_differences = generated_probs_dict_expanded - outputs_probs_expanded
    print ("prob_differences.shape = ", outputs_probs.shape, generated_probs_dict_expanded.shape, outputs_probs_expanded.shape, prob_differences.shape)
    # calculate the compression loss
    # find number of elements in generated_predicted_logits
    D = generated_probs_dict.numel()
    compression_loss = lambda_val* calculate_entropy_elbo_difference (prob_differences, unique_token_counts_dict, D)
    

    # Optionally, add a term for LoRA regularization if needed
    lora_regularization = 0
    # for param in model.parameters():
    #     for lora_layer in lora_layers:
    #         if is_lora_param(param, lora_layer):
    #             lora_regularization += torch.norm(param)
    print (standard_loss, compression_loss)

    return standard_loss + compression_loss + lora_lambda_val * lora_regularization

In [24]:
# Fine tuning using custom loss

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

lr = 1e-4
num_epochs = 2

optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=lr)
scaler = GradScaler()

for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        with autocast():
            # Forward pass
            outputs = model(**batch)
            loss = custom_loss(outputs, batch, encoding_dict, lora_layers)

            # Backward pass and optimization
            optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.captions[idx])
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(0.9278, grad_fn=<NllLossBackward0>) tensor(5.8234, grad_fn=<MulBackward0>)


Epoch 0:  14%|█▍        | 1/7 [03:08<18:50, 188.48s/it, loss=6.75]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.2973, grad_fn=<NllLossBackward0>) tensor(5.1359, grad_fn=<MulBackward0>)


Epoch 0:  29%|██▊       | 2/7 [06:12<15:30, 186.08s/it, loss=6.43]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.3930, grad_fn=<NllLossBackward0>) tensor(4.9709, grad_fn=<MulBackward0>)


Epoch 0:  43%|████▎     | 3/7 [08:56<11:43, 175.80s/it, loss=6.36]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.5346, grad_fn=<NllLossBackward0>) tensor(4.7530, grad_fn=<MulBackward0>)


Epoch 0:  57%|█████▋    | 4/7 [11:47<08:42, 174.05s/it, loss=6.29]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.5586, grad_fn=<NllLossBackward0>) tensor(4.7234, grad_fn=<MulBackward0>)


Epoch 0:  71%|███████▏  | 5/7 [14:35<05:43, 171.76s/it, loss=6.28]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.7431, grad_fn=<NllLossBackward0>) tensor(4.4591, grad_fn=<MulBackward0>)


Epoch 0:  86%|████████▌ | 6/7 [17:39<02:55, 175.85s/it, loss=6.2] 

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([4, 15, 1, 50257])
prob_differences.shape =  torch.Size([4, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([4, 15, 1, 50257]) torch.Size([4, 15, 287, 50257])
torch.Size([4, 15, 287])
tensor(1.8746, grad_fn=<NllLossBackward0>) tensor(1.0626, grad_fn=<MulBackward0>)


Epoch 0: 100%|██████████| 7/7 [17:47<00:00, 152.50s/it, loss=2.94]
  0%|          | 0/7 [00:00<?, ?it/s]

generated_probs_expanded.shape =  torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257])
prob_differences.shape =  torch.Size([16, 15, 50257]) torch.Size([1, 1, 287, 50257]) torch.Size([16, 15, 1, 50257]) torch.Size([16, 15, 287, 50257])
torch.Size([16, 15, 287])
tensor(1.8659, grad_fn=<NllLossBackward0>) tensor(4.2316, grad_fn=<MulBackward0>)


In [None]:
# create directory to save the model if it doesn't exist
if not os.path.exists("models"):
    os.mkdir("models")
# save model checkpoint to models directory using current timestamp and date
torch.save(model.state_dict(), f"models/{time.strftime('%Y%m%d-%H%M%S')}.pth")


In [None]:
# load latest model checkpoint among all the saved models
latest_model = torch.load(max(glob.glob('models/*.pth'), key=os.path.getctime))
# load the model with the latest checkpoint
model.load_state_dict(latest_model)
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [None]:
# Generate captions for the test dataset
generated_captions_custom_model = []
# Iterate over the dataset and generate captions
for data in dataset_finetune:
    image = data['image']
    generated_logits, generated_predicted_ids, caption = generate_caption_with_logits(image)
    generated_captions_custom_model.append(caption)


EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated
EOS has been generated


In [None]:
# Encode compressed dictionary word using manual huffman encoding

In [None]:
# Replace compressed_dict words occurring in the generated_captions_custom_model with their corresponding huffman encoding

In [None]:
# compare encoded generated_captions_custom_model + huffman encoding dictionary information with the original generated_captions to calculate compression ratio

In [None]:
# print generated_captions and generated_captions_custom_model elementwise to compare the results
for i in range(len(generated_captions_finetune)):
    print (generated_captions_finetune[i], "WAIT", generated_captions_custom_model[i])
# print the frequency of each word in generated_captions
print (encoding_dict)


a green truck parked next to a curb a green truck parked next WAIT a man is sitting on a man sitting on a man sitting on a man
a man is walking down the street with a skateboard a man WAIT a man is playing a man playing a man playing a man playing a man
a baseball player swinging a bat at a ball a baseball player swinging WAIT a man is playing a man playing a man playing a man playing a man
a cow is standing in a field of grass a cow is standing WAIT a man is playing a man playing a man playing a man playing a man
a black dog sitting in the back of a truck a black dog WAIT a man is sitting on a man sitting on a man sitting on a man
a man wearing a bow tie and glasses a man wearing a bow WAIT a man is a man is a man is a man is a man is
a dining room table with a large bowl of food a large kitchen WAIT a man is sitting on a man sitting on a man sitting on a man
a man standing next to a wall with a bunch of guitars a WAIT a man is a man is a man is a man is a man is
a man is playing tenn