In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import os


2024-04-10 16:24:55.066203: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 16:24:55.088885: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Reference copy of the settings this notebook expects from train_config.
# NOTE(review): the commented values below may be out of date relative to
# train_config.py (e.g. SEQ_LENGTH=100 here, while the tokenizer output shown
# later in this notebook has length 40) — confirm against train_config.py.
# ###CONSTANTS
# DATA_PATH = "data/flickr30k_images/"
# IMAGES_PATH = "data/flickr30k_images/flickr30k_images/"
# IMAGE_SIZE=(224, 224)
# VAL_FRACTION=0.05
# SEQ_LENGTH=100
# BATCH_SIZE=64
# EPOCHS=20
# AUTOTUNE=tf.data.AUTOTUNE
# ###
# The star import supplies the configuration names that later cells use without
# defining them: DATA_PATH, IMAGES_PATH, VAL_FRACTION, BATCH_SIZE and AUTOTUNE.
from train_config import *

In [3]:
# Read the pipe-separated caption table and give its three columns clean names,
# then preview the first ten rows.
captionings_df = (
    pd.read_csv(os.path.join(DATA_PATH, "results.csv"), sep="|")
    .set_axis(["image_name", "comment_number", "comment"], axis=1)
)
captionings_df.head(10)

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...
1,1000092795.jpg,1,"Two young , White males are outside near many..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .
5,10002456.jpg,0,Several men in hard hats are operating a gian...
6,10002456.jpg,1,Workers look down from up above on a piece of...
7,10002456.jpg,2,Two men working on a machine wearing hard hats .
8,10002456.jpg,3,Four men on top of a tall structure .
9,10002456.jpg,4,Three men on a large rig .


In [4]:
# Shuffle the caption rows with a fixed seed so the split is reproducible.
# sort_index() first restores the original row order (a no-op on the first run),
# so re-running this cell produces the same shuffle instead of shuffling the
# already-shuffled frame again.
# NOTE(review): rows are shuffled individually, so different captions of the same
# image can end up in both the train and the validation split — confirm that this
# is acceptable for validation.
captionings_df = captionings_df.sort_index().sample(frac=1,
                                                    random_state=42,
                                                    replace=False,
                                                    )

# Number of rows that go to training; the remaining rows are used for validation.
n_train_examples = int(len(captionings_df) * (1 - VAL_FRACTION))

# Positional slicing of the shuffled frame.
train_captionings_df = captionings_df.iloc[:n_train_examples]
val_captionings_df = captionings_df.iloc[n_train_examples:]

print("Train image-text examples: ", train_captionings_df.shape[0])
print("Validation image-text examples: ", val_captionings_df.shape[0])

# Save the splits; create the output directory first so to_csv does not fail
# when "splits/" does not exist yet.
os.makedirs("splits", exist_ok=True)
train_captionings_df.to_csv("splits/train_captions.csv", index=False)
val_captionings_df.to_csv("splits/val_captions.csv", index=False)

Train image-text examples:  150969
Validation image-text examples:  7946


In [5]:
# Sanity check: show the column names of the training split.
train_captionings_df.columns

Index(['image_name', 'comment_number', 'comment'], dtype='object')

## Preprocessing

In [14]:
# process_input and make_dataset are deliberately not imported here: this notebook
# defines its own versions in a later cell, which would silently shadow the
# imported ones.
from data_processing import build_tokenizer, build_image_augmenter, decode_and_resize


# Build the text tokenizer and fit its vocabulary on the training captions only.
tokenizer = build_tokenizer()
tokenizer.adapt(train_captionings_df["comment"].tolist())

In [15]:
# Show the first 100 entries of the adapted vocabulary.
print(tokenizer.get_vocabulary()[:100])

['', '[UNK]', 'a', 'in', 'the', 'on', 'and', 'man', 'is', 'of', 'with', 'woman', 'two', 'are', 'to', 'people', 'at', 'an', 'wearing', 'young', 'white', 'shirt', 'black', 'while', 'his', 'blue', 'red', 'girl', 'sitting', 'men', 'boy', 'dog', 'standing', 'playing', 'street', 'group', 'down', 'front', 'her', 'walking', 'holding', 'one', 'water', 'by', 'three', 'women', 'green', 'up', 'looking', 'child', 'as', 'for', 'little', 'large', 'outside', 'yellow', 'person', 'children', 'brown', 'through', 'from', 'their', 'hat', 'other', 'ball', 'small', 'into', 'next', 'over', 'some', 'dressed', 'out', 'running', 'another', 'building', 'jacket', 'riding', 'around', 'orange', 'near', 'field', 'crowd', 'stands', 'beach', 'background', 'pink', 'sidewalk', 'behind', 'jumping', 'table', 'girls', 'sits', 'grass', 'snow', 'bike', 'that', 'looks', 'top', 'camera', 'air']


In [16]:
# List the GPU devices TensorFlow can see and display them as the cell output.
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [17]:
# Smoke check: tokenize a single sentence and inspect the resulting token ids.
tokenizer("a tall man with blue t-shirt and a dog")

<tf.Tensor: shape=(40,), dtype=int64, numpy=
array([  2, 407,   7,  10,  25, 185,   6,   2,  31,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0])>

In [18]:
def process_input(img_path, captions):
    """Convert one (image path, caption) pair into an (image, token ids) pair.

    Args:
        img_path: Path of the image file; passed unchanged to
            ``decode_and_resize``.
        captions: Caption text; passed unchanged to the notebook-level
            ``tokenizer``.

    Returns:
        A 2-tuple ``(image, token_ids)`` holding the result of
        ``decode_and_resize(img_path)`` and of ``tokenizer(captions)``.

    Note:
        Uses the global ``tokenizer``, so the tokenizer must be built and
        adapted before this function is called.
    """
    image = decode_and_resize(img_path)
    token_ids = tokenizer(captions)
    return image, token_ids


def make_dataset(images, captions):
    """Build a batched ``tf.data`` pipeline from paired images and captions.

    Args:
        images: Sequence sliced element-wise together with ``captions``; each
            element is handed to ``process_input`` as ``img_path``
            (presumably image file paths — confirm with the caller).
        captions: Sequence of captions aligned with ``images``.

    Returns:
        A ``tf.data.Dataset`` whose elements are shuffled (buffer of
        ``BATCH_SIZE * 8`` elements), mapped through ``process_input`` in
        parallel, grouped into batches of ``BATCH_SIZE`` and prefetched.
        Shuffling is always applied, whatever split the data comes from.
    """
    return (
        tf.data.Dataset.from_tensor_slices((images, captions))
        # Shuffle the raw pairs before the map step.
        .shuffle(BATCH_SIZE * 8)
        .map(process_input, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [19]:
# Smoke check: run the preprocessing on one known image and a hand-written caption.
process_input(os.path.join(IMAGES_PATH, "1000092795.jpg" ), "A man with a white dog")

(<tf.Tensor: shape=(224, 224, 3), dtype=float32, numpy=
 array([[[0.00585078, 0.02381562, 0.01875477],
         [0.00965832, 0.0196198 , 0.01569823],
         [0.0172462 , 0.02443735, 0.01678437],
         ...,
         [0.7029076 , 0.8511351 , 0.9051871 ],
         [0.8868041 , 0.98794574, 1.        ],
         [0.8786048 , 0.99592185, 1.        ]],
 
        [[0.00295392, 0.01079705, 0.00687548],
         [0.01572215, 0.02356529, 0.01964372],
         [0.01424195, 0.02391214, 0.01960394],
         ...,
         [0.8678015 , 0.9623651 , 0.9731852 ],
         [0.62791204, 0.83664644, 0.8352128 ],
         [0.8186369 , 0.96124005, 0.9427249 ]],
 
        [[0.03255731, 0.05216515, 0.03647888],
         [0.00538098, 0.01640164, 0.00357776],
         [0.0212921 , 0.02732662, 0.01683657],
         ...,
         [0.16157061, 0.35575992, 0.01498929],
         [0.11622956, 0.3528709 , 0.02961439],
         [0.3335867 , 0.598159  , 0.08154134]],
 
        ...,
 
        [[0.19922091, 0.25221053