## Token Extractor 
This script just takes the captioned files, tokenizes them to keep the essential parts of speech and creates a new file with image name and tokens in the same directory. 

In [50]:
# function that creates tokens
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def _group_consecutive(tagged_tokens, tag_prefix):
    # Join each run of adjacent tokens whose tag starts with tag_prefix into one space-separated phrase.
    phrases = []
    run = []
    for word, tag in tagged_tokens:
        if tag.startswith(tag_prefix):
            run.append(word)
        elif run:
            # A token of another kind ends the current run.
            phrases.append(" ".join(run))
            run = []
    if run:
        # Flush a run that reaches the end of the caption.
        phrases.append(" ".join(run))
    return phrases


def extract_tokens(captions):
    """Extract noun phrases, verbs and adjective phrases from each caption.

    Each caption is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Consecutive tokens whose tag starts with "N" are joined into
    a noun phrase, consecutive tokens whose tag starts with "J" are joined
    into an adjective phrase, and every token whose tag starts with "V" is
    kept as an individual verb.

    Args:
        captions: iterable of caption strings.

    Returns:
        A list with one entry per caption, in input order. Each entry is a
        list of unique strings: noun phrases first, then verbs, then
        adjective phrases, each group in order of first appearance.
    """
    extracted_tokens = []
    for caption in captions:
        tagged_tokens = pos_tag(word_tokenize(caption))

        noun_phrases = _group_consecutive(tagged_tokens, "N")
        verbs = [word for word, tag in tagged_tokens if tag.startswith("V")]
        adjective_phrases = _group_consecutive(tagged_tokens, "J")

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the result is the same on every run (a set would order the strings
        # differently from one interpreter session to the next).
        combined = noun_phrases + verbs + adjective_phrases
        extracted_tokens.append(list(dict.fromkeys(combined)))

    return extracted_tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


First we import the image caption data as-is

In [51]:
# This part just imports and cleans the captions
import os, sys, string

captions = "8kcaptions.txt"

# The caption file is looked up in the first entry of sys.path.
with open(os.path.join(sys.path[0], captions), "r", encoding="utf8") as f:
    image_data = f.readlines()

# The field separator is chosen from the file name.
if "8k" in captions:
    sep = ','
elif "30k" in captions:
    sep = '\t'
else:
    # Fail here with a clear message instead of a NameError on `sep` below.
    raise ValueError("cannot infer the separator for {!r}: expected '8k' or '30k' in the name".format(captions))

# Separate image names and captions. Each line is split at the FIRST separator
# only, so a caption that itself contains the separator (e.g. a comma in the
# 8k file) is kept whole instead of being cut at its first comma.
# NOTE(review): if the file starts with a header row (e.g. "image,caption"),
# it is kept as an ordinary data row - confirm this is intended.
im_names = []
im_captions = []
for line_no, line in enumerate(image_data, start=1):
    if not line.strip():
        # Skip blank lines (such as a trailing newline at the end of the file).
        continue
    name, found, caption = line.partition(sep)
    if not found:
        raise ValueError("line {} of {!r} has no {!r} separator".format(line_no, captions, sep))
    im_names.append(name.strip())
    im_captions.append(caption.strip())

Next, we isolate the i-th caption for each image (each image has 5 captions). **WE DON'T DO THIS ANYMORE - the model is trained on all 5 captions**, so the cell below now only strips punctuation from the captions.

In [52]:
# # get only captions in i_idx index for each image
# i_idx = 0
# im_names = im_names[i_idx+1::5]
# im_captions = im_captions[i_idx+1::5]

# strip every punctuation character from each caption;
# the translation table is built once and reused for all captions
punctuation_remover = str.maketrans('', '', string.punctuation)
im_captions = [caption.translate(punctuation_remover) for caption in im_captions]

Then we feed the captions into the tokenizer

In [53]:
# Run the part-of-speech extraction over every caption; the result holds one
# list of tokens per caption, in the same order as im_captions.
extracted_tokens = extract_tokens(im_captions)

Finally, we write each image name together with its extracted tokens to a new file, `tokenized_<caption file name>`, in the same directory.

In [54]:
# Write one "<image name>,<space-separated tokens>" line per caption.
# encoding="utf8" matches the encoding used to read the caption file; without
# it the platform default encoding is used, which can fail on non-ASCII tokens.
with open('tokenized_' + captions, 'w', encoding="utf8") as f:
    for im_name, tokens in zip(im_names, extracted_tokens):
        f.write(im_name + ',' + ' '.join(tokens) + '\n')