In [10]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import os


In [27]:
# ###CONSTANTS
# IMAGE_SIZE=(224, 224)
# VAL_FRACTION=0.05
# SEQ_LENGTH=100
# BATCH_SIZE=64
# EPOCHS=20
# AUTOTUNE=tf.data.AUTOTUNE
# ###
from train_config import *

In [28]:
captionings_df = pd.read_csv(os.path.join(DATA_PATH, "results.csv"), sep="|").dropna()
captionings_df.columns = ["image_name", "comment_number", "comment"]
captionings_df["image_name"] = IMAGES_PATH + "/" + captionings_df["image_name"] 

captionings_df.head(10)

Unnamed: 0,image_name,comment_number,comment
0,data/flickr30k_images/flickr30k_images//100009...,0,Two young guys with shaggy hair look at their...
1,data/flickr30k_images/flickr30k_images//100009...,1,"Two young , White males are outside near many..."
2,data/flickr30k_images/flickr30k_images//100009...,2,Two men in green shirts are standing in a yard .
3,data/flickr30k_images/flickr30k_images//100009...,3,A man in a blue shirt standing in a garden .
4,data/flickr30k_images/flickr30k_images//100009...,4,Two friends enjoy time spent together .
5,data/flickr30k_images/flickr30k_images//100024...,0,Several men in hard hats are operating a gian...
6,data/flickr30k_images/flickr30k_images//100024...,1,Workers look down from up above on a piece of...
7,data/flickr30k_images/flickr30k_images//100024...,2,Two men working on a machine wearing hard hats .
8,data/flickr30k_images/flickr30k_images//100024...,3,Four men on top of a tall structure .
9,data/flickr30k_images/flickr30k_images//100024...,4,Three men on a large rig .


In [29]:
captionings_df["comment"].apply(len).describe()

count    158914.000000
mean         65.198894
std          26.869503
min           8.000000
25%          47.000000
50%          60.000000
75%          77.000000
max         407.000000
Name: comment, dtype: float64

In [30]:
captionings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158914 entries, 0 to 158914
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   image_name      158914 non-null  object
 1   comment_number  158914 non-null  object
 2   comment         158914 non-null  object
dtypes: object(3)
memory usage: 4.8+ MB


In [31]:
#Shuffle df
captionings_df = captionings_df.sample(frac=1,
                                       random_state=42,
                                       replace=False,
                                       )


n_train_examples = int(len(captionings_df) * (1 - VAL_FRACTION))

train_captionings_df = captionings_df[ : n_train_examples]
val_captionings_df = captionings_df[n_train_examples : ]

print("Train image-text examples: ", train_captionings_df.shape[0])
print("Validation image-text examples: ", val_captionings_df.shape[0])

#save splits
train_captionings_df.to_csv("splits/train_captions.csv", index=False)
val_captionings_df.to_csv("splits/val_captions.csv", index=False)

Train image-text examples:  150968
Validation image-text examples:  7946


In [32]:
train_captionings_df.columns

Index(['image_name', 'comment_number', 'comment'], dtype='object')

## Preprocessing

In [33]:
from data_processing import build_tokenizer, build_image_augmenter,  decode_and_resize


tokenizer = build_tokenizer()
tokenizer.adapt(train_captionings_df["comment"].tolist())

In [34]:
print(tokenizer.get_vocabulary()[:100])

['', '[UNK]', 'a', 'in', 'the', 'on', 'and', 'man', 'is', 'of', 'with', 'woman', 'two', 'are', 'to', 'people', 'at', 'an', 'wearing', 'white', 'young', 'shirt', 'black', 'while', 'his', 'blue', 'red', 'sitting', 'girl', 'men', 'boy', 'standing', 'dog', 'playing', 'street', 'group', 'down', 'front', 'her', 'walking', 'holding', 'one', 'water', 'by', 'three', 'women', 'green', 'up', 'looking', 'child', 'as', 'for', 'little', 'large', 'outside', 'yellow', 'person', 'children', 'brown', 'through', 'hat', 'their', 'from', 'other', 'ball', 'small', 'next', 'into', 'over', 'some', 'dressed', 'out', 'another', 'running', 'building', 'jacket', 'riding', 'around', 'orange', 'near', 'field', 'crowd', 'stands', 'beach', 'background', 'pink', 'sidewalk', 'behind', 'jumping', 'girls', 'table', 'sits', 'grass', 'bike', 'snow', 'that', 'looks', 'top', 'camera', 'air']


In [35]:
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [36]:
tokenizer("a tall man with blue t-shirt and a dog")

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  2, 409,   7,  10,  25, 179,   6,   2,  32,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0])>

In [37]:
def process_input(img_path, captions):
    return decode_and_resize(img_path), tokenizer(captions)


def make_dataset(images, captions):
    dataset = tf.data.Dataset.from_tensor_slices((images, captions))
    dataset = dataset.shuffle(BATCH_SIZE * 8)
    dataset = dataset.map(process_input, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

    return dataset

In [43]:
train_dataset = make_dataset(train_captionings_df["image_name"].tolist(),
                             train_captionings_df["comment"].tolist())

val_dataset = make_dataset(train_captionings_df["image_name"].tolist(),
                             train_captionings_df["comment"].tolist())



In [49]:
for img, cap in val_dataset.take(5):
    print(img.shape, cap.shape)

(64, 224, 224, 3) (64, 80)
(64, 224, 224, 3) (64, 80)
(64, 224, 224, 3) (64, 80)
(64, 224, 224, 3) (64, 80)
(64, 224, 224, 3) (64, 80)
