In [1]:
# import distutils
import numpy as np
from pickle import load 
from pickle import dump
from PIL import Image 
import os
import string
import keras


In [10]:
def load_file_path(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

def map_image_to_captions(filename):
    file  = load_file_path(filename)
    lines  = file.strip().splitlines()
    image_to_caption = {}
    for line in lines:
        img, caption = line.split(',', 1)
        img = img.strip()
        caption = caption.strip()

        if img not in image_to_caption:
            image_to_caption[img] = [caption]
        else:
            image_to_caption[img].append(caption)
    return image_to_caption

def clean_captions(image_to_caption):
    translation_table = str.maketrans('', '', string.punctuation)

    for img, caps in image_to_caption.items():
        for i in range(len(caps)):
            img_caption = caps[i].replace("-", " ")
            tokens = img_caption.split()
            cleaned_words = [
                word.lower().translate(translation_table) for word in tokens
                if len(word) > 1 and word.isalpha()
            ]
            caps[i] = ' '.join(cleaned_words)
    return image_to_caption


def create_vocab(image_to_caption):
    vocab = set()
    for img, caps in image_to_caption.items():
        for caption in caps:
            vocab.update(caption.split()) 

    return vocab


def save_img_to_captions(image_to_caption, filename_to_save):
    lines = []
    for img, caps in image_to_caption.items():
        for caption in caps:
            lines.append(img + '\t' + caption)

    data = "\n".join(lines) 
    with open(filename_to_save, "w") as file:
        file.write(data)


In [11]:
dataset_text = r'C:\Users\user\Desktop\ML\Image_Caption_Generator\flickr8k\captions.txt'

image_to_caption = map_image_to_captions(dataset_text)
print("length of dictionary = " ,len(image_to_caption))

clean_texts = clean_captions(image_to_caption)

vocabulary = create_vocab(image_to_caption)
print('length of vocabulary =', len(vocabulary))

save_img_to_captions(clean_texts, 'img_caption.txt')

length of dictionary =  8092
length of vocabulary = 8405
