In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import os
from keras import layers

from train_config import *



2024-04-11 16:09:04.797598: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 16:09:04.821027: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the pipe-separated caption file, drop incomplete rows, and normalise
# the column names to the three fields used by the rest of the notebook.
results_csv_path = os.path.join(DATA_PATH, "results.csv")

captionings_df = (
    pd.read_csv(results_csv_path, sep="|")
    .dropna()
    .set_axis(["image_name", "comment_number", "comment"], axis=1)
    .assign(
        # Turn bare file names into paths under IMAGES_PATH.
        # NOTE(review): the rendered preview shows "//" in these paths, so
        # IMAGES_PATH presumably already ends with a separator -- confirm.
        image_name=lambda df: IMAGES_PATH + "/" + df["image_name"],
        # Wrap every caption in the special start/end tokens.
        comment=lambda df: "<START> " + df["comment"] + " <END>",
    )
)

captionings_df.head(10)

Unnamed: 0,image_name,comment_number,comment
0,data/flickr30k_images/flickr30k_images//100009...,0,<START> Two young guys with shaggy hair look ...
1,data/flickr30k_images/flickr30k_images//100009...,1,"<START> Two young , White males are outside n..."
2,data/flickr30k_images/flickr30k_images//100009...,2,<START> Two men in green shirts are standing ...
3,data/flickr30k_images/flickr30k_images//100009...,3,<START> A man in a blue shirt standing in a g...
4,data/flickr30k_images/flickr30k_images//100009...,4,<START> Two friends enjoy time spent together...
5,data/flickr30k_images/flickr30k_images//100024...,0,<START> Several men in hard hats are operatin...
6,data/flickr30k_images/flickr30k_images//100024...,1,<START> Workers look down from up above on a ...
7,data/flickr30k_images/flickr30k_images//100024...,2,<START> Two men working on a machine wearing ...
8,data/flickr30k_images/flickr30k_images//100024...,3,<START> Four men on top of a tall structure ....
9,data/flickr30k_images/flickr30k_images//100024...,4,<START> Three men on a large rig . <END>


In [3]:
# Shuffle the caption rows reproducibly (fixed seed, sampling without
# replacement) before cutting the frame into train / validation parts.
captionings_df = captionings_df.sample(
    frac=1,
    random_state=42,
    replace=False,
)

# Number of rows that go to the training split; the remainder is validation.
n_train_examples = int(len(captionings_df) * (1 - VAL_FRACTION))

# .iloc makes it explicit that the split is positional (by row order after
# shuffling), not based on index labels.
train_captionings_df = captionings_df.iloc[:n_train_examples]
val_captionings_df = captionings_df.iloc[n_train_examples:]

# NOTE(review): the split is done per caption row. The data preview shows
# several captions per image, so the same image can presumably appear in both
# splits -- consider splitting on unique image_name if that matters.

print("Train image-text examples: ", train_captionings_df.shape[0])
print("Validation image-text examples: ", val_captionings_df.shape[0])

# Save splits. Create the target directory first so to_csv does not fail
# when the notebook is run in a fresh checkout.
os.makedirs("splits", exist_ok=True)
train_captionings_df.to_csv("splits/train_captions.csv", index=False)
val_captionings_df.to_csv("splits/val_captions.csv", index=False)

Train image-text examples:  150968
Validation image-text examples:  7946


In [4]:
from data_processing import build_tokenizer, build_image_augmenter,  decode_and_resize


# Build the caption tokenizer and adapt it on the training captions only,
# so validation text does not influence what the tokenizer learns.
tokenizer = build_tokenizer()
train_caption_texts = train_captionings_df["comment"].tolist()
tokenizer.adapt(train_caption_texts)

def process_input(img_path, captions):
    """Turn one (image path, caption text) pair into model-ready tensors.

    Args:
        img_path: Path of the image, passed to `decode_and_resize`.
        captions: Caption text, passed to the module-level `tokenizer`.

    Returns:
        A tuple `(image, tokens)` where `image` is the result of
        `decode_and_resize` and `tokens` is the tokenizer output reshaped
        to `(1, SEQ_LENGTH)`.
    """
    image = decode_and_resize(img_path)
    token_ids = tokenizer(captions)
    # Add a leading axis of size 1 in front of the token sequence.
    token_ids = tf.reshape(token_ids, shape=(1, SEQ_LENGTH))
    return image, token_ids

def make_dataset(images, captions):
    """Build a shuffled, batched and prefetched `tf.data` pipeline.

    Args:
        images: Sequence of image paths.
        captions: Sequence of caption strings, aligned with `images`.

    Returns:
        A `tf.data.Dataset` whose elements are batches produced by mapping
        `process_input` over the (image path, caption) pairs.
    """
    path_caption_pairs = tf.data.Dataset.from_tensor_slices((images, captions))
    return (
        path_caption_pairs
        # Shuffle the cheap string pairs before the costly image decoding.
        .shuffle(BATCH_SIZE * 8)
        .map(process_input, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )


2024-04-11 16:09:07.951750: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 16:09:07.971647: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 16:09:07.971727: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
# Training pipeline: image paths and captions from the training split.
train_dataset = make_dataset(
    train_captionings_df["image_name"].tolist(),
    train_captionings_df["comment"].tolist(),
)

# Validation pipeline: built from the held-out validation split (not the
# training frame) so validation is measured on rows outside the training set.
val_dataset = make_dataset(
    val_captionings_df["image_name"].tolist(),
    val_captionings_df["comment"].tolist(),
)


In [6]:
# Peek at the first five validation batches and print the image / caption
# tensor shapes to confirm the pipeline output layout.
for img, cap in val_dataset.take(5):
    batch_shapes = (img.shape, cap.shape)
    print(*batch_shapes)

(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)


In [9]:
from model import TransformerDecoderBlock, TransformerEncoderBlock, ImageCaptioningModel, get_cnn_model

In [12]:
# EfficientNetB1 backbone with ImageNet weights and no classification head.
base_model = keras.applications.efficientnet.EfficientNetB1(
    weights="imagenet",
    include_top=False,
    input_shape=(*IMAGE_SIZE, 3),
)

# Wrap the backbone with the project helper to obtain the image feature model.
cnn = get_cnn_model(base_model)

# Encoder and decoder blocks share the embedding and feed-forward sizes but
# use a different number of attention heads.
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)

# Assemble the full captioning model; no image augmentation layer is passed.
caption_model = ImageCaptioningModel(cnn_model=cnn, encoder=encoder, decoder=decoder)





In [13]:
# SANITY CHECK: pull a single batch from the training pipeline; the next cell
# fits the model on just this batch.
single_batch_ds = train_dataset.take(1)
X_batch, y_batch = next(iter(single_batch_ds))

# Per-element loss (no reduction) computed on probabilities, not logits.
cross_entropy = keras.losses.SparseCategoricalCrossentropy(
    reduction="none",
    from_logits=False,
)

sanity_optimizer = keras.optimizers.Adam(0.01)
caption_model.compile(optimizer=sanity_optimizer, loss=cross_entropy)

In [14]:
# Fit repeatedly on the single sanity-check batch for 20 epochs.
caption_model.fit(
    X_batch,
    y_batch,
    epochs=20,
)

Epoch 1/20


2024-04-11 16:11:24.559458: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-04-11 16:11:24.571112: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2024-04-11 16:11:24.754642: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x76db7e5ba6b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-04-11 16:11:24.754660: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090 Laptop GPU, Compute Capability 8.9
2024-04-11 16:11:24.757260: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-11 16:11:24.817253: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifeti

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x76defc2dca00>