## CLIP and tensorflow similarity

In [2]:
import os
import gc
import tensorflow
import textwrap
import cv2
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

In [3]:
import tensorflow as tf

In [4]:
try:
  import tensorflow_similarity as tfsim
except ModuleNotFoundError:
  !pip install tensorflow_similarity
  import tensorflow_similarity as tfsim

import tensorflow_similarity.losses as tfsim_losses

Collecting tensorflow_similarity
  Downloading tensorflow_similarity-0.17.1-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.4/230.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting distinctipy (from tensorflow_similarity)
  Downloading distinctipy-1.2.3-py3-none-any.whl (25 kB)
Collecting nmslib (from tensorflow_similarity)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn (from tensorflow_similarity)
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11<2.6.2 (from nmslib->tensorflow_similarity)
  Using cached pybind11-2.6.1-py2.py3-none-any.wh

In [5]:
try:
  from transformers import TFCLIPTextModel, TFCLIPVisionModel, CLIPTokenizer, TFCLIPModel
except:
  !pip install transformers
  from transformers import TFCLIPTextModel, TFCLIPVisionModel, CLIPTokenizer, TFCLIPModel

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Col

In [6]:
tfsim.utils.tf_cap_memory()
gc.collect()
tf.keras.backend.clear_session()

In [7]:
print("Tensorflow version:", tf.__version__)
print("Tensorflow Similarity version:", tfsim.__version__)

Tensorflow version: 2.13.0
Tensorflow Similarity version: 0.17.1


In [8]:
N_CPU = os.cpu_count()
IMG_SIZE = 224
BATCH_SIZE = 64
COLOR_CHANNELS = 3
N_TOKENS = 77
DATA_DIR = "deep_fashion_multi_modal"

## Data prep

In [9]:
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

In [10]:
### loading datasets from drive

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
!mkdir /content/deep_fashion_multi_modal/images/

In [None]:
!unzip /content/drive/MyDrive/deepfashion/Copy\ of\ images.zip -d /content/deep_fashion_multi_modal/images/

In [None]:
root_dir = '/content/deep_fashion_multi_modal/'

In [None]:
images_list = glob.glob(os.path.join(root_dir, 'images/images/*.jpg'))

In [None]:
len(images_list)

In [None]:
## Check images in image list
image_id = 120
image = cv2.imread(images_list[image_id])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image)
plt.show()

In [None]:
## loading captions
import json

In [None]:
!cp /content/drive/MyDrive/deepfashion/Copy\ of\ captions.json /content/deep_fashion_multi_modal/

In [None]:
with open("/content/deep_fashion_multi_modal/Copy of captions.json") as f:
  d = json.loads(f.read())

In [None]:
#d

In [None]:
df = pd.DataFrame()

In [None]:
df['image_paths'] = images_list
df['file_name'] = df['image_paths'].apply(lambda x: x.split('/')[-1])
df['description'] = df['file_name'].apply(lambda x: d[x] if x in d else None)

In [None]:
df.head()

In [None]:
for i in df[:5].itertuples():
  image = cv2.imread(i.image_paths)
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  print('description:',i.description)
  print('text length:', len(i.description))
  plt.imshow(image)
  plt.show()

In [26]:
df.isnull().sum()

image_paths       0
file_name         0
description    1552
dtype: int64

In [27]:
df = df[~(df['description'].isnull())]

In [28]:
df.isnull().sum()

image_paths    0
file_name      0
description    0
dtype: int64

In [29]:
text_list = df['description'].to_list()
imgs_list = df['image_paths'].to_list()

In [30]:
train_images, val_images, train_texts, val_texts = train_test_split(
    imgs_list, text_list, test_size=0.2, random_state=17
)

In [31]:
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

Downloading (…)okenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

In [32]:
train_tokens = tokenizer(
    train_texts,
    padding="max_length",
    return_tensors="tf",
    truncation=True
)

In [33]:
val_tokens = tokenizer(
    val_texts,
    padding="max_length",
    return_tensors="tf",
    truncation=True,
)

In [34]:
train_tokens[:10]


{'input_ids': <tf.Tensor: shape=(10, 77), dtype=int32, numpy=
 array([[49406,   589,  2533, 11869,   320,  3005,   268, 10536,   339,
           268,  2523,   593,  5979,  3140, 11637,   269,   518,   339,
           268,  2523,   533,   593,  7050, 10033,   269,   585,   791,
           320,  2522,  6066,  1148,   269,   518,  9680,   589,  2533,
         11869,   533,   539,  2097,   268,  2301, 10130,   269,   518,
          9680,   631,   593, 13606, 10033,   537,  5979,  3140, 11637,
           269,   997,   533,   320,  2540,   525,   899,  8895,   269,
           997,   533,   550, 20417,   525,   899, 16139,   269, 49407,
         49407, 49407, 49407, 49407, 49407],
        [49406,   899,  2523,   791,  1538, 19691,   267, 18863, 10033,
           537,  6148,  3140, 11637,   269,   585,   791,   320, 47961,
          6066,  1148,   269,   518,  5003,   589,  2909, 11869,   533,
           539,  1538, 10130,   269,   518,  5003,   631,   593,  5862,
         10033,   537,  6148,

In [35]:
def get_image_embed(image_path):
  image = tf.io.read_file(image_path)
  image = tf.image.decode_jpeg(image, channels=3)
  image = tf.image.convert_image_dtype(image, tf.float32)
  image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE], method="nearest")
  image = tf.transpose(image, [2,0,1])
  return image

In [36]:
def data_mapper(img, input_ids, attention_mask):
  return get_image_embed(img), tf.squeeze(input_ids), tf.squeeze(attention_mask)

In [37]:
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_images, train_tokens['input_ids'], train_tokens['attention_mask']))
    .map(data_mapper, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(2000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

print('Train Dataset Shapes')
for i in train_ds.take(1):
  for nn, tensor in zip(["Image", "Input ids", "Attention Masks"], i):
    print(f"{nn}: {tensor.shape}")


Train Dataset Shapes
Image: (64, 3, 224, 224)
Input ids: (64, 77)
Attention Masks: (64, 77)


In [38]:
train_tokens[:2]

{'input_ids': <tf.Tensor: shape=(2, 77), dtype=int32, numpy=
 array([[49406,   589,  2533, 11869,   320,  3005,   268, 10536,   339,
           268,  2523,   593,  5979,  3140, 11637,   269,   518,   339,
           268,  2523,   533,   593,  7050, 10033,   269,   585,   791,
           320,  2522,  6066,  1148,   269,   518,  9680,   589,  2533,
         11869,   533,   539,  2097,   268,  2301, 10130,   269,   518,
          9680,   631,   593, 13606, 10033,   537,  5979,  3140, 11637,
           269,   997,   533,   320,  2540,   525,   899,  8895,   269,
           997,   533,   550, 20417,   525,   899, 16139,   269, 49407,
         49407, 49407, 49407, 49407, 49407],
        [49406,   899,  2523,   791,  1538, 19691,   267, 18863, 10033,
           537,  6148,  3140, 11637,   269,   585,   791,   320, 47961,
          6066,  1148,   269,   518,  5003,   589,  2909, 11869,   533,
           539,  1538, 10130,   269,   518,  5003,   631,   593,  5862,
         10033,   537,  6148, 

In [39]:
val_ds = (
    tf.data.Dataset.from_tensor_slices((val_images, val_tokens["input_ids"], val_tokens["attention_mask"]))
    .map(data_mapper, num_parallel_calls=N_CPU)
    .cache()
    .batch(BATCH_SIZE)
)

print("\n")
print("Val Dataset Shapes")
for i in val_ds.take(1):
    for nm, tensor in zip(["Image", "Input Id", "Attention Mask"], i):
        print(f"{nm}: {tensor.shape}")



Val Dataset Shapes
Image: (64, 3, 224, 224)
Input Id: (64, 77)
Attention Mask: (64, 77)


## Model Setup

Loading CLIP Weights

In [40]:
model = TFCLIPModel.from_pretrained('openai/clip-vit-base-patch32')
vision_weights = tf.Variable(model.weights[-2])
text_weights = tf.Variable(model.weights[-1])

del model

tf.keras.backend.clear_session()

Downloading tf_model.h5:   0%|          | 0.00/606M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


In [41]:
print('vison model weights shape', vision_weights.shape)
print('text model weights shape', text_weights.shape)

vison model weights shape (768, 512)
text model weights shape (512, 512)


## Load Pretrained CLIP Text Model

In [42]:
CLIP_text_model = TFCLIPTextModel.from_pretrained(
    "openai/clip-vit-base-patch32"
)

Some layers from the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing TFCLIPTextModel: ['clip/vision_model/encoder/layers_._2/layer_norm1/gamma:0', 'clip/vision_model/encoder/layers_._7/mlp/fc1/bias:0', 'clip/vision_model/encoder/layers_._7/self_attn/q_proj/kernel:0', 'clip/vision_model/encoder/layers_._2/mlp/fc1/kernel:0', 'clip/vision_model/encoder/layers_._8/layer_norm1/beta:0', 'clip/vision_model/encoder/layers_._2/self_attn/out_proj/kernel:0', 'clip/vision_model/embeddings/class_embedding:0', 'clip/vision_model/encoder/layers_._1/self_attn/out_proj/bias:0', 'clip/vision_model/encoder/layers_._9/self_attn/v_proj/kernel:0', 'clip/vision_model/encoder/layers_._11/self_attn/k_proj/bias:0', 'clip/vision_model/encoder/layers_._0/self_attn/k_proj/kernel:0', 'clip/vision_model/encoder/layers_._8/self_attn/k_proj/kernel:0', 'clip/vision_model/encoder/layers_._6/layer_norm2/gamma:0', 'clip/vision_model/encoder/layers_._6/mlp/fc2/kernel:0', 'clip/vision_model/

## Loading pretrained CLIP Vision Model

In [43]:
CLIP_vision_model = TFCLIPVisionModel.from_pretrained(
    "openai/clip-vit-base-patch32"
)

Some layers from the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing TFCLIPVisionModel: ['clip/text_model/encoder/layers_._0/mlp/fc2/bias:0', 'clip/text_model/encoder/layers_._10/layer_norm1/beta:0', 'clip/text_model/encoder/layers_._8/mlp/fc1/kernel:0', 'clip/text_model/encoder/layers_._0/self_attn/out_proj/kernel:0', 'clip/text_model/encoder/layers_._4/self_attn/k_proj/kernel:0', 'clip/text_model/encoder/layers_._6/self_attn/q_proj/bias:0', 'clip/text_model/encoder/layers_._11/self_attn/k_proj/bias:0', 'clip/text_model/encoder/layers_._8/mlp/fc2/kernel:0', 'clip/text_model/encoder/layers_._8/self_attn/k_proj/bias:0', 'clip/text_model/encoder/layers_._9/mlp/fc1/bias:0', 'clip/text_model/encoder/layers_._4/mlp/fc1/bias:0', 'clip/text_model/encoder/layers_._9/layer_norm2/beta:0', 'clip/text_model/encoder/layers_._10/self_attn/q_proj/bias:0', 'clip/text_model/encoder/layers_._10/mlp/fc1/bias:0', 'clip/text_model/encoder/layers_._2/self_attn/k_proj/bias:0'

In [44]:
CLIP_vision_model.trainable = False
CLIP_text_model.trainable = False

In [45]:
CLIP_text_model.summary()

Model: "tfclip_text_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 clip (TFCLIPTextMainLayer)  multiple                  63165952  
                                                                 
Total params: 63165952 (240.96 MB)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 63165952 (240.96 MB)
_________________________________________________________________


In [46]:
CLIP_vision_model.summary()

Model: "tfclip_vision_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 clip (TFCLIPVisionMainLaye  multiple                  87456000  
 r)                                                              
                                                                 
Total params: 87456000 (333.62 MB)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 87456000 (333.62 MB)
_________________________________________________________________


In [47]:
def get_image_model(n_dims=512):
    x = inputs = tf.keras.layers.Input((COLOR_CHANNELS, IMG_SIZE, IMG_SIZE), name="image")
    x = CLIP_vision_model(x).pooler_output  # pooled CLS states
    kernel_weights = tf.constant_initializer(vision_weights.numpy())
    # Projection layer
    embed = tf.keras.layers.Dense(n_dims, name="image_embedding", kernel_initializer=kernel_weights)(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=embed, name="image_model")
    return model

In [48]:
def get_text_model(n_dims=512):
  inputs1 = tf.keras.layers.Input((N_TOKENS), dtype=tf.int32, name="input_ids")
  inputs2 = tf.keras.layers.Input((N_TOKENS), dtype=tf.int32, name="attention_masks")
  x = CLIP_text_model(input_ids=inputs1, attention_mask=inputs2).pooler_output #pooled CLS states
  kernel_weights = tf.constant_initializer(text_weights.numpy())

  embed = tf.keras.layers.Dense(n_dims, name='text_embedding', kernel_initializer= kernel_weights)(x)
  model = tf.keras.models.Model(inputs=[inputs1, inputs2],outputs=embed, name="text_model")
  return model

In [49]:
img_model = get_image_model()
text_model = get_text_model()

img_model.summary()
print("\n")
text_model.summary()

Model: "image_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 image (InputLayer)          [(None, 3, 224, 224)]     0         
                                                                 
 tfclip_vision_model (TFCLI  TFBaseModelOutputWithPo   87456000  
 PVisionModel)               oling(last_hidden_state             
                             =(None, 50, 768),                   
                              pooler_output=(None, 7             
                             68),                                
                              hidden_states=None, at             
                             tentions=None)                      
                                                                 
 image_embedding (Dense)     (None, 512)               393728    
                                                                 
Total params: 87849728 (335.12 MB)
Trainable params: 39

In [50]:
loss_fn = tfsim_losses.MultiNegativesRankLoss()

In [51]:
val_loss = 0
base_image_embeddings = []
base_text_embeddings = []
for image_batch, input_ids_batch, attention_mask_batch in tqdm(val_ds):
    image_embedding = img_model(image_batch, training=False)
    text_embedding = text_model([input_ids_batch, attention_mask_batch], training=False)

    image_embedding = tf.math.l2_normalize(image_embedding, axis=1)
    text_embedding = tf.math.l2_normalize(text_embedding, axis=1)

    base_image_embeddings.append(image_embedding.numpy())
    base_text_embeddings.append(text_embedding.numpy())

    # Compute the loss value for this minibatch.
    loss_value = loss_fn(text_embedding, image_embedding)
    val_loss += float(loss_value)

print(f"\nMean Validation Loss: {val_loss / len(val_ds)}")
base_image_embeddings = np.concatenate(base_image_embeddings)
base_text_embeddings = np.concatenate(base_text_embeddings)


100%|██████████| 133/133 [01:52<00:00,  1.18it/s]


Mean Validation Loss: 3.6794318722603014





Using R@k metric for evaluation. A common metric used in evaluation of ranking. It is the average of recall by taking top k predictions

In [52]:
def recall_at_k(sim_matrix, k=1):
  """
  It is the mean ratio of correctly retrieved documents
  to the number of relevant documents.
  This implementation is specific to data having unique label
  for each key
  """
  sorted_mat = np.argsort(sim_matrix, axis=1)[:, -k:]
  print('sorted_matrix', sorted_mat)
  true_labels = np.arange(sorted_mat.shape[0]).reshape(-1,1)
  true_labels = np.repeat(true_labels, k, axis=1)
  sorted_mat = sorted_mat - true_labels

  tps = np.any(sorted_mat==0, axis=1)
  return tps.mean()


In [53]:
base_sim_mat = np.matmul(base_text_embeddings, base_image_embeddings.T)
for k in range(1, 6):
    print("R@{}: {}".format(k, recall_at_k(base_sim_mat, k)))

sorted_matrix [[7840]
 [3676]
 [ 423]
 ...
 [4029]
 [4286]
 [1629]]
R@1: 0.0037607239393583265
sorted_matrix [[3934 7840]
 [8101 3676]
 [1328  423]
 ...
 [2404 4029]
 [ 589 4286]
 [3935 1629]]
R@2: 0.007403925255611705
sorted_matrix [[1264 3934 7840]
 [4535 8101 3676]
 [8109 1328  423]
 ...
 [3934 2404 4029]
 [6108  589 4286]
 [7807 3935 1629]]
R@3: 0.009871900340815608
sorted_matrix [[7226 1264 3934 7840]
 [3912 4535 8101 3676]
 [7629 8109 1328  423]
 ...
 [7840 3934 2404 4029]
 [1441 6108  589 4286]
 [5868 7807 3935 1629]]
R@4: 0.011634739687389822
sorted_matrix [[8034 7226 1264 3934 7840]
 [8461 3912 4535 8101 3676]
 [1734 7629 8109 1328  423]
 ...
 [8034 7840 3934 2404 4029]
 [7815 1441 6108  589 4286]
 [1584 5868 7807 3935 1629]]
R@5: 0.014455282641908568


## model training

In [56]:
epochs = 5
img_optimizer = tf.keras.optimizers.Adam(1e-5)
text_optimizer = tf.keras.optimizers.Adam(1e-5)
train_step_losses = []
train_epoch_losses = []

@tf.function
def train_step(image_batch, text_batch):
  with tf.GradientTape() as img_tape, tf.GradientTape() as text_tape:
    image_embedding = img_model(image_batch, training = True)
    text_embedding = text_model(text_batch, training =True)

    image_embedding = tf.math.l2_normalize(image_embedding, axis=1)
    text_embedding = tf.math.l2_normalize(text_embedding, axis=1)

    #compute the loss value for this minibatch
    loss_value = loss_fn(text_embedding, image_embedding)

    img_grads = img_tape.gradient(loss_value, img_model.trainable_weights)
    text_grad = text_tape.gradient(loss_value, text_model.trainable_weights)

    img_optimizer.apply_gradients(zip(img_grads, img_model.trainable_weights))
    text_optimizer.apply_gradients(zip(text_grad, text_model.trainable_weights))

    return loss_value

In [None]:
for epoch in range(epochs):
  print(f"\n Epoch {epoch +1}")
  epoch_loss = 0

  for step, (image_batch, input_ids_batch, attention_mask_batch) in enumerate(train_ds):
    loss_value = train_step(image_batch, [input_ids_batch, attention_mask_batch])
    epoch_loss += float(loss_value)
    train_step_losses.append(float(loss_value)/ image_batch.shape[0])

    if step % 100 == 0:
      print(f"Training loss (for one batch) at step {step + 1}: {float(loss_value):.4f}")
      print("Seen so far: %s samples" % ((step + 1) * BATCH_SIZE))
  print(f"Epoch loss: {epoch_loss/ len(train_ds)}")
  train_epoch_losses.append(epoch_loss / len(train_ds))


 Epoch 1
Training loss (for one batch) at step 1: 3.6568
Seen so far: 64 samples


In [None]:
plt.plot(train_epoch_losses)
plt.title("Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()

In [None]:
plt.plot(train_step_losses)
plt.title("Training")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.show()

In [None]:
val_loss = 0
image_embeddings = []
text_embeddings = []
for image_batch, input_ids_batch, attention_mask_batch in tqdm(val_ds):
    image_embedding = img_model(image_batch, training=False)
    text_embedding = text_model([input_ids_batch, attention_mask_batch], training=False)

    image_embedding = tf.math.l2_normalize(image_embedding, axis=1)
    text_embedding = tf.math.l2_normalize(text_embedding, axis=1)

    image_embeddings.append(image_embedding.numpy())
    text_embeddings.append(text_embedding.numpy())

    # Compute the loss value for this minibatch.
    loss_value = loss_fn(text_embedding, image_embedding)
    val_loss += float(loss_value)

print(f"Mean Validation Loss: {val_loss / len(val_ds)}")
image_embeddings = np.concatenate(image_embeddings)
text_embeddings = np.concatenate(text_embeddings)


In [None]:
finetuned_sim = np.matmul(text_embeddings, image_embeddings.T)

for k in range(1, 6):
    print("R@{} : {}".format(k, recall_at_k(finetuned_sim, k)))

In [None]:
img_model.save("image_model.h5")
text_model.save("text_model.h5")

In [None]:
# img_model = tf.keras.models.load_model("image_model.h5", custom_objects={"TFCLIPVisionModel": TFCLIPVisionModel})
# text_model = tf.keras.models.load_model("text_model.h5", custom_objects={"TFCLIPTextModel": TFCLIPTextModel})