Introducing manual compression of image captions on stale (offline) data

In [2]:
!pip install dahuffman


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Using cached GitPython-3.1.41-py3-none-any.whl (196 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.40.3-py2.py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.8/257.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl (11 kB)
Collecting appdirs>=1.4.3 (from wandb)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)
  Using cached gitdb-4.0.11-py3-none-any.whl (62 kB)
Coll

In [1]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikilravi10[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from transformers import AdamW
from datasets import load_dataset
import torch
import torch.optim as optim
from collections import Counter
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch.nn.functional as F
import numpy as np
import os
import time
import glob
import logging
from dahuffman import HuffmanCodec

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pre-trained model and its components
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [4]:
from datasets import list_datasets
len(list_datasets())

10000

In [5]:
# Load a dataset (for example, a subset of the COCO dataset)
# TODO: Potential datasets with repititive nature that can be used: MS COCO, Flickr30k, Visual Genome, SBU Captions 

# load small part of the coco dataset from all the .jpg images in datasets/mscoco/test2015
dataset = load_dataset("datasets/coco/images/test2017/", split="train[:25]")

Resolving data files: 100%|██████████| 40670/40670 [00:00<00:00, 222482.65it/s]
Using custom data configuration test2017-bcc41c24fb40aad9
Found cached dataset imagefolder (/Users/nikilravi/.cache/huggingface/datasets/imagefolder/test2017-bcc41c24fb40aad9/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


In [6]:
len(dataset)

25

In [7]:
def generate_caption(image, max_length=128):
    inputs = feature_extractor(images=image, return_tensors="pt")
    output_ids = model.generate(inputs["pixel_values"], max_length=max_length)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

In [8]:
# Iterate over the dataset and generate captions
max_length = 128
generated_captions = []

for data in dataset:
    image = data['image']
    caption = generate_caption(image, max_length=max_length)
    generated_captions.append(caption)

In [9]:
def update_encoding_dict(captions, encoding_dict):
    for caption in captions:
        words = caption.split()
        encoding_dict.update(words) # purpose of update is to add the words to the dictionary if they don't exist
    return encoding_dict

In [10]:
encoding_dict = Counter() # Counter is a subclass of dictionary for counting hashable objects
threshold = 2 # threshold for word frequency # TODO: find a good threshold

update_encoding_dict(generated_captions, encoding_dict)

# Optionally, create a more compressed form based on frequency
compressed_dict = {word: idx for idx, (word, freq) in enumerate(encoding_dict.items()) if freq > threshold}

In [11]:
generated_captions

['a green truck parked next to a curb ',
 'a baseball player swinging a bat at a ball ',
 'a cow is standing in a field of grass ',
 'a man is playing tennis on a clay court ',
 'a man and a woman playing a game of frisbee ',
 'a woman and a man are drinking wine ',
 'a zebra standing in a fenced in area ',
 'a horse grazing in a field with a tree ',
 'a bird perched on top of a bird feeder ',
 'a train on a track near a fence ',
 'an elephant with a large trunk standing on a dirt ground ',
 'a stuffed animal with a teddy bear on it ',
 'a plate of food with meat, broccoli and potatoes ',
 'a man in a suit and tie looking at his cell phone ',
 'a motorcycle parked on the side of a road ',
 'a bear walking through a forest with leaves ',
 'a plate of food on a table ',
 'a remote control sitting on top of a couch ',
 'a large jetliner flying through a cloudy sky ',
 'a man in a suit and tie speaking to a crowd ',
 'a plate of food with meat, rice and vegetables ',
 'a person jumping a s

In [12]:
encoding_dict

Counter({'a': 57,
         'on': 11,
         'of': 9,
         'with': 8,
         'and': 7,
         'in': 6,
         'man': 6,
         'standing': 4,
         'to': 3,
         'top': 3,
         'plate': 3,
         'food': 3,
         'parked': 2,
         'next': 2,
         'at': 2,
         'is': 2,
         'field': 2,
         'playing': 2,
         'woman': 2,
         'tree': 2,
         'bird': 2,
         'large': 2,
         'bear': 2,
         'meat,': 2,
         'suit': 2,
         'tie': 2,
         'through': 2,
         'green': 1,
         'truck': 1,
         'curb': 1,
         'baseball': 1,
         'player': 1,
         'swinging': 1,
         'bat': 1,
         'ball': 1,
         'cow': 1,
         'grass': 1,
         'tennis': 1,
         'clay': 1,
         'court': 1,
         'game': 1,
         'frisbee': 1,
         'are': 1,
         'drinking': 1,
         'wine': 1,
         'zebra': 1,
         'fenced': 1,
         'area': 1,
         'horse':

In [13]:
class CaptionAutoencoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, max_seq_length):
        super(CaptionAutoencoder, self).__init__()

        # Encoder
        self.encoder_embedding = nn.Embedding(vocab_size, embedding_dim) # input shape has to be (batch_size, sequence_length), output shape is (batch_size, sequence_length, embedding_dim)
        self.encoder_rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=4, batch_first=True) # output shape is (batch_size, sequence_length, hidden_dim)
        self.max_seq_length = max_seq_length

        # Decoder
        self.decoder_rnn = nn.GRU(hidden_dim, hidden_dim, num_layers=4, batch_first=True) # output shape is (batch_size, sequence_length, hidden_dim)
        self.decoder_output = nn.Linear(hidden_dim, vocab_size)

    def encode(self, captions):
        embedded = self.encoder_embedding(captions)
        encoded, _ = self.encoder_rnn(embedded)
        return encoded[:, -1, :]

    def decode(self, encoded):
        # Repeat the encoded state across the sequence length
        repeated_encoded = encoded.unsqueeze(1).repeat(1, self.max_seq_length, 1) 
        decoded, _ = self.decoder_rnn(repeated_encoded)
        output = self.decoder_output(decoded)
        return output

    def forward(self, captions):
        encoded = self.encode(captions)
        decoded = self.decode(encoded)
        return decoded


In [14]:
class CaptionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, captions):
        self.encodings = encodings
        self.captions = captions

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.captions[idx])
        return item

    def __len__(self):
        return len(self.captions)

In [15]:
# Assuming `dataset` is your dataset containing images and captions
images = [data['image'] for data in dataset]
captions = generated_captions

# Process images and captions
inputs = feature_extractor(images=images, return_tensors="pt")
outputs = tokenizer(captions, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Assuming 'captions' is a tensor of tokenized captions generated by VLM
vocab_size = tokenizer.vocab_size
embedding_dim = 50257
hidden_dim = 512
max_seq_length = 128
autoencoder = CaptionAutoencoder(vocab_size, embedding_dim, hidden_dim, max_seq_length)
autoencoder_output = autoencoder(outputs["input_ids"])

# Create dataset and dataloader
train_dataset = CaptionDataset(inputs, outputs["input_ids"])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [16]:
class LoRALayer(nn.Module):
    def __init__(self, original_weight, rank):
        super(LoRALayer, self).__init__()
        self.original_weight = original_weight
        self.rank = rank
        self.U = nn.Parameter(torch.Tensor(self.original_weight.size(0), self.rank))
        self.V = nn.Parameter(torch.Tensor(self.rank, self.original_weight.size(1)))
        nn.init.xavier_uniform_(self.U)
        nn.init.xavier_uniform_(self.V)

    def forward(self):
        return self.original_weight + self.U @ self.V

In [17]:
# Modify the first attention layer of the encoder
# TODO: Try modifying other layers as well and check the results
lora_layers = []

with torch.no_grad():
    original_weight = model.encoder.encoder.layer[0].attention.output.dense.weight
    lora_layer = LoRALayer(original_weight, rank=10).forward()  # Choose an appropriate rank
    # assign the new layer to the model
    model.encoder.encoder.layer[0].attention.output.dense.weight.copy_(lora_layer)
    # add the layer of the model to the list of LoRA layers
    lora_layers.append(model.encoder.encoder.layer[0].attention.output.dense)

In [18]:
def is_lora_param(param, lora_layer):
    # check if the parameter is part of the LoRA layer
    print (lora_layer.parameters())
    print ("nuj")
    print (param)
    return param in lora_layer.parameters()

In [19]:
def custom_loss(outputs, batch, lora_layers, autoencoder, standard_lambda_val = 1, lora_lambda_val = 0.01, compression_lambda_val = 0.01):
    # Standard captioning loss
    standard_loss = outputs.loss

    # Autoencoder compression reward
    captions = batch['labels']
    compressed_captions = autoencoder.encode(captions)
    # Measure the sparsity of the compressed representation (e.g., using L1 norm) # TODO: Try other measures
    compression_reward = torch.norm(compressed_captions, p=1)
    # Adjust the reward: lower norm (more sparse) should lead to lower loss (higher reward)
    compression_loss = compression_reward

    # Optionally, add a term for LoRA regularization if needed
    lora_regularization = 0
    # for param in model.parameters():
    #     for lora_layer in lora_layers:
    #         if is_lora_param(param, lora_layer):
    #             lora_regularization += torch.norm(param)
    return standard_lambda_val* standard_loss + compression_lambda_val * compression_loss + lora_lambda_val * lora_regularization

In [20]:
ae_criterion1 = nn.CrossEntropyLoss()

def ae_criterion2 (reconstructed_caption, original_caption, end_of_text_token_id):

    loss= 0

    for i in range(len(original_caption)):

        # remove all end of text tokens from the right in original caption
        trim_index = 0
        for j in range(len(original_caption[i])-1, -1, -1):
            if original_caption[i][j] != end_of_text_token_id:
                trim_index = j
                break
        trim_index += 1

        # Trim the trailing spaces
        trimmed_original = original_caption[i][:trim_index]
        trimmed_reconstructed = reconstructed_caption[i][:trim_index]

        # Calculate the loss (assuming cross-entropy loss)
        loss += ae_criterion1(trimmed_reconstructed, trimmed_original)

    return loss

In [21]:
run = wandb.init(
    # Set the project where this run will be logged
    project="vlm-compression",
    config={"dataset_size": "25"}
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [23]:
def train(num_epochs, vlm_lr, ae_lr, vlm_optimizer, ae_optimizer, device, save_every=5):
    for epoch in range(num_epochs):
        loop = tqdm(train_loader, leave=True)
        for batch in loop:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Fine tune VLM with custom loss
            # Forward pass
            model.zero_grad()
            outputs = model(**batch)
            vlm_loss = custom_loss(outputs, batch, lora_layers, autoencoder)
            # Backward pass and optimization
            vlm_optimizer.zero_grad()
            vlm_loss.backward()
            vlm_optimizer.step()

            # Train the autoencoder
            autoencoder.zero_grad()
            captions = batch['labels']
            compressed_captions = autoencoder.encode(captions)
            reconstructed_captions = autoencoder.decode(compressed_captions)
            # reconstructed_flat = reconstructed_captions.view(-1, reconstructed_captions.size(-1))
            # captions_flat = captions.view(-1)
            end_of_text_token_id = tokenizer.encode('<|endoftext|>')[0]
            ae_loss = ae_criterion2(reconstructed_captions, captions, end_of_text_token_id)
            ae_loss.backward()
            ae_optimizer.step()

            # TODO: change loss as combination of vlm_loss and ae_loss instead of individual losses

            # Update progress bar
            loop.set_description(f"Epoch {epoch}")
            loop.set_postfix(vlm_loss=vlm_loss.item(), ae_loss=ae_loss.item())

            wandb.log(
                {
                    "epoch": epoch,
                    "vlm_loss": vlm_loss,
                    "ae_loss": ae_loss,
                    "vlm_lr": vlm_lr,
                    "ae_lr": ae_lr
                }
            )

            if epoch % save_every == 0:
                # Save the model
                # create directory to save the model if it doesn't exist
                if not os.path.exists("auto_epoch_exp"):
                    os.mkdir("auto_epoch_exp")
                # save model checkpoint to models directory using current timestamp and date
                torch.save(model.state_dict(), f"auto_epoch_exp/{time.strftime('%Y%m%d-%H%M%S')}-{epoch}-{vlm_lr}-{ae_lr}-{vlm_loss}-{ae_loss}.pth")


In [24]:
num_epochs = 30
vlm_lrs = [5e-5, 3e-4]#[1e-4, 1e-3, 3e-4, 3e-3]
ae_lrs = [1e-3, 3e-4]#[1e-4, 1e-3, 3e-4, 3e-3]
for vlm_lr in vlm_lrs:
    for ae_lr in ae_lrs:
        vlm_optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=vlm_lr)
        ae_optimizer = optim.Adam(autoencoder.parameters(), lr=ae_lr) 
        train(num_epochs, vlm_lr, ae_lr, vlm_optimizer, ae_optimizer, device)



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.captions[idx])
Epoch 0: 100%|██████████| 1/1 [02:32<00:00, 152.38s/it, ae_loss=271, vlm_loss=18.3]
Epoch 1: 100%|██████████| 1/1 [02:34<00:00, 154.81s/it, ae_loss=267, vlm_loss=29.1]
Epoch 2: 100%|██████████| 1/1 [02:34<00:00, 154.60s/it, ae_loss=244, vlm_loss=58.9]
Epoch 3: 100%|██████████| 1/1 [02:29<00:00, 149.68s/it, ae_loss=216, vlm_loss=84.6]
Epoch 4: 100%|██████████| 1/1 [02:35<00:00, 155.47s/it, ae_loss=191, vlm_loss=98.5]
Epoch 5: 100%|██████████| 1/1 [02:41<00:00, 161.39s/it, ae_loss=170, vlm_loss=106]
Epoch 6: 100%|██████████| 1/1 [02:23<00:00, 143.47s/it, ae_loss=151, vlm_loss=111]
Epoch 7: 100%|██████████| 1/1 [02:34<00:00, 154.08s/it, ae_loss=136, vlm_loss=114]
Epoch 8: 100%|██████████| 1/1 [02:37<00:00, 157.40s/it, ae_loss=123, vlm_loss=116]
Epoch 9: 100%|██████████| 1/1 [02:33<00:00, 153.49s/it, ae_loss=113, vlm_loss=118]
Epoch 10: 100%|██████████| 1/1 [02:

KeyboardInterrupt: 

In [25]:
# create directory to save the model if it doesn't exist
if not os.path.exists("models_auto_compress_online_data"):
    os.mkdir("models_auto_compress_online_data")
# save model checkpoint to models directory using current timestamp and date
torch.save(model.state_dict(), f"models_auto_compress_online_data/{time.strftime('%Y%m%d-%H%M%S')}.pth")


In [21]:
# load latest model checkpoint among all the saved models
latest_model = torch.load(max(glob.glob('models_auto_compress_online_data/*.pth'), key=os.path.getctime))
# load the model with the latest checkpoint
model.load_state_dict(latest_model)

<All keys matched successfully>

In [26]:
# Generate captions for the test dataset
generated_captions_custom_model = []
generated_captions_custom_model_pre_compression = []
# Iterate over the dataset and generate captions
for data in dataset:
    image = data['image']
    # use autoencoder to encode and decode the caption
    caption = generate_caption(image)
    generated_captions_custom_model_pre_compression.append(caption)
    caption = tokenizer(caption, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    caption = caption['input_ids']
    caption = caption.to(device)
    print (caption)
    compressed_caption = autoencoder.encode(caption)
    compressed_caption = compressed_caption.to(device)
    # print (compressed_caption)
    reconstructed_caption = autoencoder.decode(compressed_caption)
    reconstructed_caption = reconstructed_caption.to(device)
    reconstructed_caption = reconstructed_caption.cpu()
    reconstructed_caption = reconstructed_caption.detach().numpy()
    reconstructed_caption = np.argmax(reconstructed_caption, axis=2)
    print (reconstructed_caption)
    reconstructed_caption = tokenizer.decode(reconstructed_caption[0], skip_special_tokens=True)
    generated_captions_custom_model.append(reconstructed_caption)

tensor([[   64,  4077,  7779, 19584,  1306,   284,   257, 20799,   220, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 5

In [27]:
generated_captions_custom_model_pre_compression

['a green truck parked next to a curb ',
 'a baseball player swinging a bat at a ball ',
 'a cow is standing in a field of grass ',
 'a man is playing tennis on a clay court ',
 'a man and a woman playing a game of frisbee ',
 'a woman and a man are drinking wine ',
 'a zebra standing in a fenced in area ',
 'a horse grazing in a field with a tree ',
 'a bird perched on top of a bird feeder ',
 'a train on a track near a fence ',
 'an elephant with a large trunk standing on a dirt ground ',
 'a stuffed animal with a teddy bear on it ',
 'a plate of food with meat, broccoli and potatoes ',
 'a man in a suit and tie looking at his cell phone ',
 'a motorcycle parked on the side of a road ',
 'a bear walking through a forest with leaves ',
 'a plate of food on a table ',
 'a remote control sitting on top of a couch ',
 'a large jetliner flying through a cloudy sky ',
 'a man in a suit and tie speaking to a crowd ',
 'a plate of food with meat, rice and vegetables ',
 'a person jumping a s

In [28]:
generated_captions_custom_model

['a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a a a                                                                                                                         ',
 'a man a a a

In [29]:
# Encode compressed dictionary word using manual huffman encoding
def huffman_code(frequencies):
    codec = HuffmanCodec.from_frequencies(frequencies)
    # encoded = codec.encode(('emnlp here we come').split())
    encoded = codec.encode(('a green truck parked next to a curb').split())
    decoded = codec.decode(encoded)
    # print("Encoded: ", encoded)
    # print("Decoded: ", decoded)
    codec.print_code_table()

    return codec

huffman_codec = huffman_code(encoding_dict)
huffman_codec.get_code_table()['horse']

Bits Code     Value Symbol
   7 0000000      0 'wave'
   7 0000001      1 'window'
   7 0000010      2 'wine'
   7 0000011      3 'zebra'
   6 000010       2 'woman'
   6 000011       3 'food'
   4 0001         1 'on'
   5 00100        4 'in'
   5 00101        5 'man'
   6 001100      12 'plate'
   6 001101      13 'to'
   5 00111        7 'and'
   2 01           1 'a'
   6 100000      32 'top'
   8 10000100   132 _EOF
   8 10000101   133 'an'
   8 10000110   134 'animal'
   8 10000111   135 'are'
   8 10001000   136 'area'
   8 10001001   137 'ball'
   7 1000101     69 'at'
   8 10001100   140 'baseball'
   8 10001101   141 'bat'
   7 1000111     71 'bear'
   8 10010000   144 'bed,'
   8 10010001   145 'bedroom'
   7 1001001     73 'bird'
   8 10010100   148 'broccoli'
   8 10010101   149 'cell'
   8 10010110   150 'chair,'
   8 10010111   151 'clay'
   8 10011000   152 'cloudy'
   8 10011001   153 'control'
   8 10011010   154 'couch'
   8 10011011   155 'court'
   8 10011100   156 '

(8, 178)

In [30]:
# Replace compressed_dict words occurring in the generated_captions_custom_model with their corresponding huffman encoding
thing_to_send = []
for caption in generated_captions_custom_model:
    symbols = caption.split()
    for symbol in symbols:
        if symbol in huffman_codec.get_code_table().keys():
            caption_new = caption.replace(symbol, str(huffman_codec.get_code_table()[symbol][1]))
    thing_to_send.append(caption_new)

In [31]:
print(generated_captions_custom_model)

['a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a                                                                                                                         ', 'a man a a a a a   

In [32]:
thing_to_send

['1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1 1 1                                                                                                                         ',
 '1 m1n 1 1 1

wandb: Network error (ConnectionError), entering retry loop.


In [None]:
# compare encoded generated_captions_custom_model + huffman encoding dictionary information with the original generated_captions to calculate compression ratio
# todo: this is wrong here, but correct for manual
compression_ratio = len(thing_to_send)/len(generated_captions_custom_model_pre_compression)

In [47]:
compression_ratio

1.0

In [33]:
# print generated_captions and generated_captions_custom_model elementwise to compare the results
for i in range(len(generated_captions)-1):
    print (generated_captions[i], generated_captions_custom_model_pre_compression[i], generated_captions_custom_model[i])

a green truck parked next to a curb  a green truck parked next to a curb  a green green parked a a a                                                                                                                         
a man is walking down the street with a skateboard  a man is walking down the street with a skateboard  a man is is a a a a a a                                                                                                                      
a baseball player swinging a bat at a ball  a baseball player swinging a bat at a ball  a man player a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a cow is standing in a field of grass  a cow is standing in a field of grass  a man is standing a a a a                                                                              

improve autoencoder architecture, use better semantic meaning preserving metric instead of simply cross entropy