In [None]:
import json
import os
import pickle
from collections import defaultdict
from typing import Dict, Tuple

import nltk


In [None]:
def create_vocab_and_tokenize(annotation_file: str, threshold: int = 5) -> Tuple[Dict, Dict]:
    """
    Build word-to-index and index-to-word mappings from a caption annotation file.

    Each caption is lower-cased and split with ``nltk.tokenize.word_tokenize``;
    tokens that occur at least ``threshold`` times across all captions are kept.
    Only the vocabulary is returned -- the tokenized captions themselves are not.

    Args:
        annotation_file: Path to the training annotation file (e.g., captions_train2014.json).
            It must be JSON with a top-level ``annotations`` list whose items each
            have a ``caption`` string.
        threshold: Minimum word count threshold (a token is kept when count >= threshold)

    Returns:
        word2idx: Dictionary mapping words to indices. Indices 0-3 are reserved for
            '<pad>', '<start>', '<end>' and '<unk>'; corpus words start at index 4,
            in order of first appearance.
        idx2word: Dictionary mapping indices to words (the inverse of word2idx)

    Raises:
        KeyError: If the file lacks ``annotations`` or an item lacks ``caption``.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which is not UTF-8 everywhere.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    captions = [ann['caption'] for ann in data['annotations']]

    word_counts = defaultdict(int)
    for caption in captions:
        for token in nltk.tokenize.word_tokenize(caption.lower()):
            word_counts[token] += 1

    # Reserve the special tokens first so their indices are fixed up front.
    special_tokens = ('<pad>', '<start>', '<end>', '<unk>')
    word2idx = {token: index for index, token in enumerate(special_tokens)}

    # Skip any corpus token that is spelled like a reserved token: giving it a
    # regular slot and then overwriting it would leave a hole in the index range.
    words = [
        w for w, c in word_counts.items()
        if c >= threshold and w not in word2idx
    ]

    first_word_index = len(special_tokens)
    for offset, word in enumerate(words):
        word2idx[word] = first_word_index + offset

    idx2word = {i: w for w, i in word2idx.items()}

    print(f"Vocabulary size: {len(word2idx)}")

    return word2idx, idx2word

In [None]:
# Fetch the NLTK tokenizer data packages before word_tokenize is used.
nltk.download('punkt')
nltk.download('punkt_tab')

# The default is the original author's local copy of the annotations. Set the
# COCO_CAPTIONS_TRAIN environment variable to use a file at another location
# without editing this cell.
annotation_file = os.environ.get(
    'COCO_CAPTIONS_TRAIN',
    '/Users/seifmahdy/Desktop/Programming/Python/NLP/Final Project/annotations/captions_train2014.json',
)
word2idx, idx2word = create_vocab_and_tokenize(annotation_file)

# Persist both mappings together; the file is written to the current working directory.
with open('vocab.pkl', 'wb') as f:
    pickle.dump({'word2idx': word2idx, 'idx2word': idx2word}, f)

print("Vocabulary saved to vocab.pkl")

[nltk_data] Downloading package punkt to /Users/seifmahdy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/seifmahdy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Vocabulary size: 8853
Vocabulary saved to vocab.pkl
