In [1]:
import json
import torch
from torchvision.datasets import CocoCaptions
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

def training_tokenization_vocab_build(train_file_name, train_save_file_name, max_length=57):
    """Build the caption vocabulary from the training file and save the encoded captions.

    The JSON file is loaded and each of its keys is treated as a caption string.
    Every caption is tokenized with torchtext's 'basic_english' tokenizer, a
    vocabulary is built over the resulting tokens, and one 1-D index tensor per
    caption is written to ``train_save_file_name`` with ``torch.save``.

    Each tensor is laid out as ``<start>``, caption tokens, ``<end>``, then
    ``<pad>`` fill, giving a length of ``max_length + 2``. Captions with more than
    ``max_length`` tokens are truncated so that every tensor has the same length.

    Args:
        train_file_name: Path to the training JSON file whose keys are captions.
        train_save_file_name: Destination path handed to ``torch.save``.
        max_length: Number of caption tokens to pad/truncate to, not counting
            the ``<start>``/``<end>`` markers. The default of 57 is the value the
            original author measured as the longest training caption.

    Returns:
        The vocabulary, with ``<unk>`` set as the default index so that tokens
        missing from the training set map to it.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(train_file_name, 'r', encoding='utf-8') as f:
        all_captions = json.load(f)

    tokenizer = get_tokenizer('basic_english')
    # Iterating the loaded mapping yields its keys, i.e. the caption strings.
    encoded_captions = [tokenizer(caption.lower()) for caption in all_captions]

    # The vocabulary is built from the full (untruncated) token lists.
    vocab = build_vocab_from_iterator(encoded_captions, specials=['<unk>', '<pad>', '<start>', '<end>'])
    vocab.set_default_index(vocab['<unk>'])
    print(len(vocab))

    # Look the special-token indices up once instead of once per caption.
    start_idx, end_idx, pad_idx = vocab['<start>'], vocab['<end>'], vocab['<pad>']

    tokenized = []
    for tokens in encoded_captions:
        # Truncate over-long captions; otherwise they would produce a tensor
        # longer than max_length + 2 and break uniform batching.
        tokens = tokens[:max_length]
        indices = [start_idx] + [vocab[token] for token in tokens] + [end_idx]
        indices += [pad_idx] * (max_length - len(tokens))
        tokenized.append(torch.tensor(indices))

    torch.save(tokenized, train_save_file_name)
    return vocab

def tokenize(file_name, save_name, max_length, vocab):
    """Encode the captions of a JSON file with an existing vocabulary and save them.

    The JSON file is loaded and each of its keys is treated as a caption string.
    Every caption is tokenized with torchtext's 'basic_english' tokenizer and
    converted to a 1-D index tensor laid out as ``<start>``, caption tokens,
    ``<end>``, then ``<pad>`` fill. The list of tensors is written to
    ``save_name`` with ``torch.save``; nothing is returned.

    Every tensor has length ``max_length + 2``. Captions with more than
    ``max_length`` tokens are truncated so the output is never ragged.

    Args:
        file_name: Path to the JSON file whose keys are captions.
        save_name: Destination path handed to ``torch.save``.
        max_length: Number of caption tokens to pad/truncate to, not counting
            the ``<start>``/``<end>`` markers.
        vocab: Token-to-index lookup supporting ``vocab[token]``; it must
            contain the ``<start>``, ``<end>`` and ``<pad>`` entries.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_name, 'r', encoding='utf-8') as f:
        all_captions = json.load(f)

    tokenizer = get_tokenizer('basic_english')
    # Look the special-token indices up once instead of once per caption.
    start_idx, end_idx, pad_idx = vocab['<start>'], vocab['<end>'], vocab['<pad>']

    tokenized = []
    # Iterating the loaded mapping yields its keys, i.e. the caption strings.
    for caption in all_captions:
        # Truncate over-long captions; otherwise they would produce a tensor
        # longer than max_length + 2 and break uniform batching.
        tokens = tokenizer(caption.lower())[:max_length]
        indices = [start_idx] + [vocab[token] for token in tokens] + [end_idx]
        indices += [pad_idx] * (max_length - len(tokens))
        tokenized.append(torch.tensor(indices))

    torch.save(tokenized, save_name)

# Build the vocabulary from the training captions, then encode the val and test
# captions with that same vocabulary, padding every split to 57 caption tokens.
vocab = training_tokenization_vocab_build(
    train_file_name='./train_caption2filename.json',
    train_save_file_name='train_tokenized_captions.pt',
)
tokenize(
    file_name='./val_caption2filename.json',
    save_name='val_tokenized_captions.pt',
    max_length=57,  # longest val caption: 56 tokens
    vocab=vocab,
)
tokenize(
    file_name='./test_caption2filename.json',
    save_name='test_tokenized_captions.pt',
    max_length=57,  # longest test caption: 54 tokens
    vocab=vocab,
)

# max_length (57 for COCO) counts only the caption's text tokens. Each saved tensor
# also carries a <start> and an <end> token, so the sequence length to use when
# designing a neural network around the text input tensor is max_length + 2.

  _torch_pytree._register_pytree_node(


24784
