In [1]:
!pip install einops -qq
!pip install tensorflow_datasets -qq
!pip install tensorflow-text -qq

In [2]:
import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow_datasets as tfds

import numpy as np
import matplotlib.pyplot as plt
import os
import pathlib
import collections
import time
import string
import re
import einops
from tqdm.auto import tqdm
from PIL import Image
import nltk
from nltk.util import ngrams
import zipfile

In [3]:
# Quieten TensorFlow: the environment variable targets TensorFlow's native
# (C++) log level, and the second line raises the Python-side TensorFlow
# logger threshold to ERROR.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is assigned here after `import tensorflow`
# has already run in the imports cell; it is usually recommended to set it
# before the import -- confirm it still has the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

# Download the NLTK 'punkt' package. As the last expression of the cell, the
# call's return value is what the notebook displays.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/saurabh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def get_data(path='../dataset/flickr8k'):
    """
    Fetch the Flickr8k image archive and the Flickr8k text archive.

    Both archives are requested through ``tf.keras.utils.get_file`` with
    ``extract=True``, using ``cache_dir='.'`` and ``path`` as the cache
    sub-directory.

    Parameters:
        path (str): Cache sub-directory handed to ``get_file``.
    """
    target_dir = pathlib.Path(path)

    # Image archive first, then the captions / split-list text archive.
    archive_urls = (
        'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip',
        'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip',
    )

    for archive_url in archive_urls:
        tf.keras.utils.get_file(
            origin=archive_url,
            cache_dir='.',
            cache_subdir=target_dir,
            extract=True,
        )

def get_dataset(path='../dataset/flickr8k/Flickr8k_text.zip'):
    """
    Build the raw training and testing datasets from the Flickr8k text files.

    Reads ``Flickr8k.token.txt`` (one record per line: an image id of the form
    ``<file>#<n>``, a TAB, then the caption) together with the
    ``Flickr_8k.trainImages.txt`` and ``Flickr_8k.testImages.txt`` split lists,
    all located directly under ``path``. Every image named in a split list is
    paired with the captions collected for it.

    Parameters:
        path (str): Directory that holds the three text files. Image paths are
            built as ``path / 'Flicker8k_Dataset' / <file>``.

    Returns:
        tuple: ``(train_raw, test_raw)``, each produced by
        ``tf.data.experimental.from_list`` from a list of
        ``(image_path_str, [caption, ...])`` pairs.

    Raises:
        ValueError: If a non-blank line of ``Flickr8k.token.txt`` has no TAB
            separating the image id from the caption.
    """
    path = pathlib.Path(path)

    def read_lines(filename):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding, and drop blank lines so they neither
        # break caption parsing nor turn into empty image names.
        text = (path / filename).read_text(encoding='utf-8')
        return [line for line in text.splitlines() if line.strip()]

    # Map image file name -> list of its captions, in file order.
    cap_dict = collections.defaultdict(list)
    for line in read_lines('Flickr8k.token.txt'):
        # Split on the FIRST tab only, so a tab inside the caption text is kept
        # as part of the caption instead of breaking the record.
        img_id, sep, cap = line.partition('\t')
        if not sep:
            raise ValueError(
                f"Malformed line in Flickr8k.token.txt (no tab separator): {line!r}"
            )
        # The image id carries a '#<n>' caption index suffix; keep the file name.
        cap_dict[img_id.split('#')[0]].append(cap)

    def build_split(filename):
        # 'Flicker8k_Dataset' (sic) is the literal directory name this notebook
        # expects for the images -- presumably the archive's own spelling;
        # do not "correct" it without checking the extracted layout.
        return [
            (str(path / 'Flicker8k_Dataset' / img_name), cap_dict[img_name])
            for img_name in read_lines(filename)
        ]

    # NOTE(review): from_list presumably needs every element to share the same
    # structure (same number of captions per image) -- confirm for custom splits.
    train_raw = tf.data.experimental.from_list(build_split('Flickr_8k.trainImages.txt'))
    test_raw = tf.data.experimental.from_list(build_split('Flickr_8k.testImages.txt'))

    return train_raw, test_raw

In [5]:
# Fetch the archives first: get_dataset() reads the caption and split-list
# text files from disk, so they must be present before it runs.
get_data()
train_raw, test_raw = get_dataset() # Pass path='flickr8k' instead when running on Windows

In [6]:
# Split sizes and the per-element structure of the training dataset.
print(len(train_raw), len(test_raw))
print(train_raw.element_spec)

# Grab a single (image path, captions) example. next(iter(...)) replaces the
# `for ... break` trick: it states the intent directly and fails right here
# if the dataset is empty, instead of with a NameError in the prints below.
img_path, captions = next(iter(train_raw.take(1)))

print(img_path)
print(captions)

6000 1000
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(5,), dtype=tf.string, name=None))
tf.Tensor(b'../dataset/flickr8k/Flickr8k_text.zip/Flicker8k_Dataset/2513260012_03d33305cf.jpg', shape=(), dtype=string)
tf.Tensor(
[b'A black dog is running after a white dog in the snow .'
 b'Black dog chasing brown dog through snow'
 b'Two dogs chase each other across the snowy ground .'
 b'Two dogs play together in the snow .'
 b'Two dogs running through a low lying body of water .'], shape=(5,), dtype=string)
