In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import os
from keras import layers

from train_config import *


os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"



2024-04-11 18:21:46.596563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 18:21:46.624458: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
captionings_df = pd.read_csv(os.path.join(DATA_PATH, "results.csv"), sep="|").dropna()
captionings_df.columns = ["image_name", "comment_number", "comment"]
captionings_df["image_name"] = IMAGES_PATH + "/" + captionings_df["image_name"] 


#ADDING START AND END special tokens
captionings_df["comment"] = "<START> " + captionings_df["comment"] + " <END>"
captionings_df.head(10)

Unnamed: 0,image_name,comment_number,comment
0,data/flickr30k_images/flickr30k_images//100009...,0,<START> Two young guys with shaggy hair look ...
1,data/flickr30k_images/flickr30k_images//100009...,1,"<START> Two young , White males are outside n..."
2,data/flickr30k_images/flickr30k_images//100009...,2,<START> Two men in green shirts are standing ...
3,data/flickr30k_images/flickr30k_images//100009...,3,<START> A man in a blue shirt standing in a g...
4,data/flickr30k_images/flickr30k_images//100009...,4,<START> Two friends enjoy time spent together...
5,data/flickr30k_images/flickr30k_images//100024...,0,<START> Several men in hard hats are operatin...
6,data/flickr30k_images/flickr30k_images//100024...,1,<START> Workers look down from up above on a ...
7,data/flickr30k_images/flickr30k_images//100024...,2,<START> Two men working on a machine wearing ...
8,data/flickr30k_images/flickr30k_images//100024...,3,<START> Four men on top of a tall structure ....
9,data/flickr30k_images/flickr30k_images//100024...,4,<START> Three men on a large rig . <END>


In [3]:
#Shuffle df
captionings_df = captionings_df.sample(frac=1,
                                       random_state=42,
                                       replace=False,
                                       )


n_train_examples = int(len(captionings_df) * (1 - VAL_FRACTION))

train_captionings_df = captionings_df[ : n_train_examples]
val_captionings_df = captionings_df[n_train_examples : ]

print("Train image-text examples: ", train_captionings_df.shape[0])
print("Validation image-text examples: ", val_captionings_df.shape[0])

#save splits
train_captionings_df.to_csv("splits/train_captions.csv", index=False)
val_captionings_df.to_csv("splits/val_captions.csv", index=False)

Train image-text examples:  150968
Validation image-text examples:  7946


In [4]:
from data_processing import build_tokenizer, build_image_augmenter,  decode_and_resize


tokenizer = build_tokenizer()
tokenizer.adapt(train_captionings_df["comment"].tolist())

def process_input(img_path, captions):
    return decode_and_resize(img_path), tf.reshape(tokenizer(captions), shape=(1, SEQ_LENGTH))

def make_dataset(images, captions):
    dataset = tf.data.Dataset.from_tensor_slices((images, captions))
    dataset = dataset.shuffle(BATCH_SIZE * 8)
    dataset = dataset.map(process_input, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

    return dataset


2024-04-11 17:54:26.142602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 17:54:26.162765: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 17:54:26.162854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
train_dataset = make_dataset(train_captionings_df["image_name"].tolist(),
                             train_captionings_df["comment"].tolist())

val_dataset = make_dataset(train_captionings_df["image_name"].tolist(),
                             train_captionings_df["comment"].tolist())


In [6]:
for img, cap in val_dataset.take(5):
    print(img.shape, cap.shape)

(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)


In [7]:
from model import TransformerDecoderBlock, TransformerEncoderBlock, ImageCaptioningModel, get_cnn_model

In [8]:
base_model = keras.applications.efficientnet.EfficientNetB1(
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights="imagenet",
    )

cnn = get_cnn_model(base_model)

encoder = TransformerEncoderBlock(
    embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=ENC_HEADS
)
decoder = TransformerDecoderBlock(
    embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=DEC_HEADS, 
)

caption_model = ImageCaptioningModel(
    cnn_model=cnn,
    #image_aug=None,
    encoder=encoder, 
    decoder=decoder
)





In [9]:
X_batch, y_batch = next(iter(train_dataset.take(1))) #SANITY CHECK


cross_entropy = keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction="none"
)


caption_model.compile(optimizer=keras.optimizers.Adam(0.001), loss=cross_entropy)

In [None]:
caption_model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
   6/2359 [..............................] - ETA: 2:51 - loss: 3.4547 - acc: 0.3495

In [11]:
#save
caption_model.save_weights("caption_weights.h5")

In [12]:


from training_utils import load_trained_model_weights



new_model = load_trained_model_weights("caption_weights.h5")

In [13]:
new_model.summary()

Model: "image_captioning_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_1 (Functional)        (None, 49, 1280)          6575239   
                                                                 
 transformer_encoder_block_  multiple                  1710080   
 1 (TransformerEncoderBlock                                      
 )                                                               
                                                                 
 transformer_decoder_block_  multiple                  14733840  
 1 (TransformerDecoderBlock                                      
 )                                                               
                                                                 
Total params: 23019163 (87.81 MB)
Trainable params: 16443920 (62.73 MB)
Non-trainable params: 6575243 (25.08 MB)
_____________________________________________________________

In [14]:
new_model.compile(optimizer=keras.optimizers.Adam(0.001), loss=cross_entropy)

new_model.evaluate(X_batch, y_batch)



[3.3812708854675293, 0.37292519211769104]

In [18]:
import gc

In [19]:
gc.collect()

10619