# Declarations

## Imports

In [1]:
import os
import re
import math
import string
import random
import requests
import importlib
import itertools

# Raise TensorFlow's C++ log threshold (presumably to hide INFO/WARNING messages);
# this is set before TensorFlow is imported in the next cell
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import plotly.graph_objects as go

from tqdm import tqdm

from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Short aliases for the Keras namespaces
tfk = tf.keras
tfkl = tf.keras.layers
kb = tf.keras.backend

2023-12-11 22:08:41.425316: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 22:08:41.425369: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 22:08:41.425423: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Report the TensorFlow version and the number of GPUs it can see
print(tf.__version__)
print(f"Num GPUs Available:  {len(tf.config.list_physical_devices('GPU'))}")

2.14.0
Num GPUs Available:  1


## Constants

In [4]:
# Randomness
# Single seed shared by every RNG below; it is also passed to the dataset import
seed = 42

# Seed Python's `random`, NumPy and TensorFlow with the same value
random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter startup, so setting it
# here probably only affects child processes, not this kernel — confirm
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)

In [5]:
# Filepaths
# Switch between the Kaggle layout (helpers and model files downloaded from GitHub,
# data read from /kaggle/input) and a local checkout of the repository
kaggle = False

# Model versions to compare; each one maps to a "<version>.yaml" structure file
model_versions = ["v4.0", "v5.0"]

github_repo = "raul-singh/Rise-of-Transformers-Project"
github_branch = "main"
github_python_prefix = ["Code", "Notebooks", "py_files"]
github_clip_models_prefix = ["Code", "Models"] if kaggle else ["..", "Models"]
# Helper modules and the names imported from each of them into this notebook
github_pyfiles_data = [
    {"name": "preprocessing", "imports": ["import_datasets"]},
    {"name": "evaluation", "imports": [
        "EvalMetrics as evm", "compute_total_relevance", "generate_image_embeddings", "generate_text_embeddings",
        "find_t2i_matches", "find_i2t_matches", "index_to_reference", "compute_relevant_at_k"
    ]},
    {"name": "visualization", "imports": ["retrieval_report", "retrieval_graph_compare"]},
    {"name": "clip", "imports": ["build_clip"]}
]
github_pyfiles = ["/".join(github_python_prefix) + "/" + pf["name"] + ".py" for pf in github_pyfiles_data]
github_clip_models = [f"{'/'.join(github_clip_models_prefix)}/{version}.yaml" for version in model_versions]

kaggle_dataset1 = "/kaggle/input/transformers-hackathon/"
kaggle_dataset2 = "/kaggle/input/transformers-hackathon-features/"
kaggle_weights = "/kaggle/input/clip-weights/"
kaggle_relevance = "/kaggle/input/clip-relevance/"

image_dir = "./resized_train"
# Must end with a separator: the relevance helpers appear to append the file name
# directly to this path, which otherwise yields "./relevanceTotRelevant_..."
relevance_dir = "./relevance/"
caption_pred_file = "caption_prediction_train.csv"
concept_det_file = "concept_detection_train.csv"
concept_file = "concepts.csv"
# Explicit weight files are only given on Kaggle; locally `None` is passed per model
clip_weights_files = [f"{version}.h5" for version in model_versions] if kaggle else [None for _ in model_versions]

if kaggle:
    # Re-root every path onto the corresponding Kaggle input dataset
    image_dir = kaggle_dataset1 + image_dir
    relevance_dir = kaggle_relevance + relevance_dir
    caption_pred_file = kaggle_dataset2 + caption_pred_file
    concept_det_file = kaggle_dataset2 + concept_det_file
    concept_file = kaggle_dataset2 + concept_file
    clip_weights_files = [kaggle_weights + weight for weight in clip_weights_files]

In [6]:
# Train/Val/Test split and filter percentages
test_size, val_size = 0.2, 0
filter_percent_dataset = 1

# Batch size
batch_size = 32

# Import dataset types and shapes
in_feat_typ = {
    'caption': tf.string,
    'concepts': tf.bool,
    'image path': tf.string,
}
# NOTE(review): `(8374)` is the plain int 8374, not a 1-tuple — confirm the importer expects an int
feature_shapes = {
    'image': (128, 128, 3),
    'caption': (),
    'concepts': (8374),
}

# Output dataset structure
x_features_eval = ['image path', 'image']
y_features_eval = ['caption', 'concepts']

# Define parameters for dataset import (a single, unshuffled, cached configuration)
dataset_parameters = [
    dict(
        x_features=x_features_eval, y_features=y_features_eval,
        x_dict=True, y_dict=True,
        shuffle_buffer_size=1,
        batch_size=batch_size,
        cached=True,
    ),
]

## Meta-Imports

In [7]:
def clean_recursive_imports(source, import_list, prefix):
    """Rewrite `from <module> import` statements so they point inside `prefix`.

    Args:
        source: text of a Python file.
        import_list: module names whose `from <name> import` statements are rewritten.
        prefix: "/"-separated directory prefix; every "/" becomes "." to form the
            package path placed in front of each module name.

    Returns:
        The text with every matching statement replaced by
        `from <dotted prefix><name> import`.
    """
    dotted_prefix = prefix.replace("/", ".")
    for module_name in import_list:
        pattern = r"from[ \t]+" + re.escape(module_name) + r"[ \t]+import"
        replacement = f"from {dotted_prefix + module_name} import"
        source = re.sub(pattern, replacement, source)
    return source
    
def import_py_from_repo(repository, branch, filepath, prefix, recursive_imports_list=None):
    """Download one file from a GitHub repository and save it under `prefix`.

    The file is fetched from raw.githubusercontent.com, optionally has its
    sibling imports rewritten by `clean_recursive_imports`, and is written to
    `prefix + <file name>`.

    Args:
        repository: "<owner>/<repo>" part of the URL.
        branch: branch name.
        filepath: "/"-separated path of the file inside the repository.
        prefix: string prepended to the file name to build the local write path.
        recursive_imports_list: optional module names whose `from <name> import`
            statements are rewritten to live under `prefix`; skipped when empty or None.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Build path for retrieval and write name
    path_pre = "https://raw.githubusercontent.com/"
    path = path_pre + repository + "/" + branch + "/" + filepath
    write_path = prefix + filepath.split("/")[-1]
    print("Downloading file from " + path)
    # Obtain raw text from file; fail loudly rather than saving an HTTP error page as source
    response = requests.get(path, timeout=30)
    response.raise_for_status()
    source = response.text
    # Clean recursive imports
    if recursive_imports_list:
        source = clean_recursive_imports(source, recursive_imports_list, prefix)
    # Create subdirectories if not exist (dirname is "" when the prefix has no directory part)
    target_dir = os.path.dirname(write_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # Write file; the context manager closes the handle even if the write fails
    with open(write_path, "w", encoding="utf-8") as f:
        f.write(source)

In [9]:
if kaggle:
    python_prefix_path = "/".join(github_python_prefix) + "/"
    python_package = ".".join(github_python_prefix)
    sibling_modules = [pf["name"] for pf in github_pyfiles_data]

    # Download every helper module before importing any of them, so a module can
    # import a sibling regardless of its position in `github_pyfiles_data`
    for py_file in github_pyfiles:
        import_py_from_repo(
            github_repo, github_branch, py_file,
            python_prefix_path,
            recursive_imports_list=sibling_modules,
        )
    # The files were created after the interpreter started: refresh the import
    # system's caches so the new modules are found
    importlib.invalidate_caches()

    for pf_data in github_pyfiles_data:
        # NOTE: this imports (and therefore executes) code just downloaded from GitHub
        import_string = f'from {python_package}.{pf_data["name"]} import {", ".join(pf_data["imports"])}'
        exec(import_string)

    # Model structure files are only downloaded, not imported
    for model in github_clip_models:
        import_py_from_repo(github_repo, github_branch, model, "/".join(github_clip_models_prefix) + "/")

else:
    # Local checkout: the helper modules are importable from the `py_files` package
    for pf_data in github_pyfiles_data:
        import_string = f'from py_files.{pf_data["name"]} import {", ".join(pf_data["imports"])}'
        exec(import_string)

# Preprocessing

In [10]:
# Load the data through the project's `import_datasets` helper, using the
# paths, feature specs and split settings defined in the constants cells
concept_info, datasets, dataset_sizes = import_datasets(
    image_dir,
    caption_pred_file,
    concept_file,
    concept_det_file,
    in_feat_typ,
    feature_shapes,
    dataset_parameters,
    filter_percent_dataset,
    test_size,
    val_size,
    seed,
)

2023-12-11 22:19:02.644207: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.


Extracting features from CSV file(s)


83275it [00:48, 1707.72it/s]


In [11]:
# Select loaded datasets and variables
# `concept_info` unpacks into two items, named here as the concept list and its one-hot form
concept_list, concepts_onehot = concept_info
# Only the third item of the first dataset entry is kept; it is used as the test dataset.
# NOTE(review): the (train, val, test) ordering is assumed from the names below — confirm
_, _, test_dataset = datasets[0]
train_ds_size, val_ds_size, test_ds_size = dataset_sizes

# Model Import

In [12]:
# Build one CLIP model per (structure file, weights file, version) triple
models = []
for structure, weights, version in zip(github_clip_models, clip_weights_files, model_versions):
    print(f"Creating model {version} from {structure}")
    clip_image_encoder, clip_text_encoder, clip = build_clip(structure, weights_path=weights)
    models.append(dict(
        name=version,
        image_encoder=clip_image_encoder,
        text_encoder=clip_text_encoder,
        clip=clip,
    ))

Creating model v4.0 from ../Models/v4.0.yaml
Downloading models...
Models downloaded.
Building clip...
Loading parameters...
../Models/Weights/v4.0.h5
Done.
Creating model v5.0 from ../Models/v5.0.yaml
Downloading models...
Models downloaded.
Building clip...
Loading parameters...
../Models/Weights/v5.0.h5
Done.


# Model Evaluation

## Evaluation Variables

In [13]:
# Top-k number
k = 10
# Threshold for concept overlap metric
concept_overlap_threshold = 2
# Visualization decimal precision
decimal_precision = 4
# Index to choose model from the array of models
model_index = 0
# Dictionaries used to load/save total relevance files
relevance_fileinfo_cap = dict(
    path=relevance_dir,
    test_split=test_size,
    val_split=val_size,
    metric="cap",
)
relevance_fileinfo_con = dict(
    path=relevance_dir,
    test_split=test_size,
    val_split=val_size,
    metric="con",
    other=[("conthresh", concept_overlap_threshold)],
)
def reference_preprocess_cap(x):
    """Preprocess a reference element for caption evaluation: its caption decoded as UTF-8."""
    return x["caption"].numpy().decode('UTF-8')


def reference_preprocess_con(x):
    """Preprocess a reference element for concept evaluation: its concept array."""
    return x["concepts"].numpy()


def reference_preprocess_con_hash(x):
    """Hashable variant of the concept preprocessing.

    Returns a frozenset with the first-axis indices of the non-zero concept
    entries; `np.where` already yields them in ascending order, so no extra
    sort is needed before building the set.
    """
    return frozenset(np.where(x["concepts"].numpy())[0])


def concept_relevance(m, o):
    """Tell whether two concept arrays overlap enough to count as a relevant match.

    The number of positions set in both arrays must reach
    `concept_overlap_threshold`, capped by the number of set positions in
    either array.
    """
    shared = np.count_nonzero(np.logical_and(m, o))
    return shared >= min(concept_overlap_threshold, np.count_nonzero(m), np.count_nonzero(o))


def concept_relevance_hash(m, o):
    """Same rule as `concept_relevance`, for concept index sets instead of arrays."""
    return len(m.intersection(o)) >= min(concept_overlap_threshold, len(m), len(o))

In [14]:
# Metric IDs
METRIC_ACCURACY, METRIC_MAP, METRIC_MAR, METRIC_F1 = "Accuracy", "MAP", "MAR", "F1"

# Metric visualization parameters, one (id, name, color) row per metric
metrics = [
    dict(zip(("id", "name", "color"), row))
    for row in (
        (METRIC_ACCURACY, "Accuracy", "green"),
        (METRIC_MAP, "Mean Average Precision", "blue"),
        (METRIC_MAR, "Mean Average Recall", "red"),
        (METRIC_F1, "F1 Score", "blueviolet"),
    )
]

## Dataset Metrics

In [17]:
# Generating embeddings for image-to-text and text-to-image tasks
dataset_reference = None
embeddings = []
for model in models:
    image_reference, test_image_embeddings = generate_image_embeddings(
        model["image_encoder"],
        test_dataset,
        dataset_pred_map=lambda x, y: x['image'],
        dataset_ref_map=lambda x, y: y | {'image path': x['image path']}
    )
    # Keep only the reference produced for the first model; the ones returned
    # for later models are discarded
    if dataset_reference is None:
        dataset_reference = image_reference

    _, test_text_embeddings = generate_text_embeddings(
        model["text_encoder"],
        test_dataset,
        dataset_pred_map=lambda x, y: y['caption'],
        dataset_ref_map=lambda x, y: y | {'image path': x['image path']}
    )

    embeddings.append({
        "image": test_image_embeddings,
        "text": test_text_embeddings,
    })

# Compute relevance for all the test queries in the dataset
tot_relevant_cap = compute_total_relevance(
    dataset_reference,
    reference_preprocess=reference_preprocess_cap,
    save_to_file=False,
    fileinfo=relevance_fileinfo_cap | {"split": "test"},
)
tot_relevant_con = compute_total_relevance(
    dataset_reference,
    reference_preprocess=reference_preprocess_con_hash,
    relevance=concept_relevance_hash,
    save_to_file=False,
    fileinfo=relevance_fileinfo_con | {"split": "test"},
)

# Define model comparison labels; the id appends the `id()` of the clip object to the model name
model_labels = [{"id": f"{model['name']}({id(model['clip'])})", "label": model["name"]} for model in models]

Generating image embeddings
Generating text embeddings
Generating image embeddings
Generating text embeddings
The relevance file "./relevanceTotRelevant_0.2_0_test_cap.csv" does not exist!
Proceeding with total relevance calculation...
The relevance file "./relevanceTotRelevant_0.2_0_test_con_conthresh-2.csv" does not exist!
Proceeding with total relevance calculation...


100%|██████████| 14403/14403 [01:12<00:00, 197.86it/s]


## Text to Image Task

In [18]:
results = []
# Text-to-image queries are the captions of the test dataset
test_queries = test_dataset.map(lambda features, targets: targets["caption"])
for model, embedding in zip(models, embeddings):
    print(f"\n### Scoring test data for {model['name']} ###")
    test_image_embeddings = embedding["image"]
    # Compute matching results and extract the relevant matches under each criterion
    test_raw_results = find_t2i_matches(
        test_queries, model["text_encoder"], test_image_embeddings,
        k=k, normalize=True,
    )
    test_results = index_to_reference(test_raw_results, dataset_reference)
    test_relevant_cap = compute_relevant_at_k(
        test_results, dataset_reference,
        k=k, reference_preprocess=reference_preprocess_cap,
    )
    test_relevant_con = compute_relevant_at_k(
        test_results, dataset_reference,
        k=k, reference_preprocess=reference_preprocess_con, relevance=concept_relevance,
    )

    results.append(dict(
        results=test_results,
        relevant_cap=test_relevant_cap,
        relevant_con=test_relevant_con,
    ))


### Scoring test data for v4.0 ###
Computing Text-to-Image matches

### Scoring test data for v5.0 ###
Computing Text-to-Image matches


### Caption equality relevance metric

In [19]:
# `results` was filled in the same order as `models`, so zipping them pairs each
# report with its model; the model name goes in the title to tell the reports apart
for model, model_results in zip(models, results):
    _ = retrieval_report(
        model_results["results"], dataset_reference, model_results["relevant_cap"], tot_relevant_cap,
        k=k,
        metrics=metrics,
        title=f"Test Data - {model['name']} - Caption equality metrics @ k={k}",
        decimal_precision=decimal_precision
    )


 ### Test Data - Caption equality metrics @ k=10 ###
Accuracy                      :    10.3873%   Baseline:     0.0785%
Mean Average Precision        :     1.0429%   Baseline:     0.0079%
Mean Average Recall           :    10.2732%   Baseline:       0.06%
F1 Score                      :     1.8936%   Baseline:     0.0139%

 ### Test Data - Caption equality metrics @ k=10 ###
Accuracy                      :    15.4788%   Baseline:     0.0785%
Mean Average Precision        :     1.5629%   Baseline:     0.0079%
Mean Average Recall           :    14.8184%   Baseline:       0.06%
F1 Score                      :     2.8276%   Baseline:     0.0139%


In [20]:
# Compare the models on the caption-equality criterion for k = 1..k
_ = retrieval_graph_compare(
    [entry["results"] for entry in results],
    dataset_reference,
    model_labels,
    tot_relevant_cap,
    k_range=(1, k),
    metrics=metrics,
    titlexyf=("k", None, "Test Data - Caption Equality model comparison"),
    reference_preprocess=reference_preprocess_cap,
)

### Concept overlap relevance metric

In [21]:
# `results` was filled in the same order as `models`, so zipping them pairs each
# report with its model; the model name goes in the title to tell the reports apart
for model, model_results in zip(models, results):
    _ = retrieval_report(
        model_results["results"], dataset_reference, model_results["relevant_con"], tot_relevant_con,
        k=k,
        metrics=metrics,
        title=f"Test Data - {model['name']} - Concept overlap metrics @ k={k}",
        decimal_precision=decimal_precision
    )


 ### Test Data - Concept overlap metrics @ k=10 ###
Accuracy                      :    77.4842%   Baseline:    26.6456%
Mean Average Precision        :     32.326%   Baseline:     3.7382%
Mean Average Recall           :     0.7177%   Baseline:       0.06%
F1 Score                      :     1.4042%   Baseline:     0.1182%

 ### Test Data - Concept overlap metrics @ k=10 ###
Accuracy                      :    80.9787%   Baseline:    26.6456%
Mean Average Precision        :    33.9562%   Baseline:     3.7382%
Mean Average Recall           :     0.7607%   Baseline:       0.06%
F1 Score                      :      1.488%   Baseline:     0.1182%


In [22]:
# Compare the models on the concept-overlap criterion for k = 1..k
_ = retrieval_graph_compare(
    [entry["results"] for entry in results],
    dataset_reference,
    model_labels,
    tot_relevant_con,
    k_range=(1, k),
    metrics=metrics,
    titlexyf=("k", None, "Test Data - Concept Overlap model comparison"),
    relevance=concept_relevance,
    reference_preprocess=reference_preprocess_con,
)

## Image to Text Task

In [23]:
results = []
# Image-to-text queries are the images of the test dataset
test_queries = test_dataset.map(lambda features, targets: features["image"])
for model, embedding in zip(models, embeddings):
    print(f"\n### Scoring test data for {model['name']} ###")
    test_text_embeddings = embedding["text"]
    # Compute matching results and extract the relevant matches under each criterion
    test_raw_results = find_i2t_matches(
        test_queries, model["image_encoder"], test_text_embeddings,
        k=k, normalize=True,
    )
    test_results = index_to_reference(test_raw_results, dataset_reference)
    test_relevant_cap = compute_relevant_at_k(
        test_results, dataset_reference,
        k=k, reference_preprocess=reference_preprocess_cap,
    )
    test_relevant_con = compute_relevant_at_k(
        test_results, dataset_reference,
        k=k, reference_preprocess=reference_preprocess_con, relevance=concept_relevance,
    )

    results.append(dict(
        results=test_results,
        relevant_cap=test_relevant_cap,
        relevant_con=test_relevant_con,
    ))


### Scoring test data for v4.0 ###
Computing Image-to-Text matches

### Scoring test data for v5.0 ###
Computing Image-to-Text matches


### Caption equality relevance metric

In [24]:
# `results` was filled in the same order as `models`, so zipping them pairs each
# report with its model; the model name goes in the title to tell the reports apart
for model, model_results in zip(models, results):
    _ = retrieval_report(
        model_results["results"], dataset_reference, model_results["relevant_cap"], tot_relevant_cap,
        k=k,
        metrics=metrics,
        title=f"Test Data - {model['name']} - Caption equality metrics @ k={k}",
        decimal_precision=decimal_precision
    )


 ### Test Data - Caption equality metrics @ k=10 ###
Accuracy                      :    11.0537%   Baseline:     0.0785%
Mean Average Precision        :      1.124%   Baseline:     0.0079%
Mean Average Recall           :    11.0469%   Baseline:       0.06%
F1 Score                      :     2.0404%   Baseline:     0.0139%

 ### Test Data - Caption equality metrics @ k=10 ###
Accuracy                      :    15.5449%   Baseline:     0.0785%
Mean Average Precision        :     1.5893%   Baseline:     0.0079%
Mean Average Recall           :    15.5312%   Baseline:       0.06%
F1 Score                      :     2.8835%   Baseline:     0.0139%


In [25]:
# Compare the models on the caption-equality criterion for k = 1..k
_ = retrieval_graph_compare(
    [entry["results"] for entry in results],
    dataset_reference,
    model_labels,
    tot_relevant_cap,
    k_range=(1, k),
    metrics=metrics,
    titlexyf=("k", None, "Test Data - Caption Equality model comparison"),
    reference_preprocess=reference_preprocess_cap,
)

### Concept overlap relevance metric

In [26]:
# `results` was filled in the same order as `models`, so zipping them pairs each
# report with its model; the model name goes in the title to tell the reports apart
for model, model_results in zip(models, results):
    _ = retrieval_report(
        model_results["results"], dataset_reference, model_results["relevant_con"], tot_relevant_con,
        k=k,
        metrics=metrics,
        title=f"Test Data - {model['name']} - Concept overlap metrics @ k={k}",
        decimal_precision=decimal_precision
    )


 ### Test Data - Concept overlap metrics @ k=10 ###
Accuracy                      :    74.9024%   Baseline:    26.6456%
Mean Average Precision        :    32.9607%   Baseline:     3.7382%
Mean Average Recall           :     0.7226%   Baseline:       0.06%
F1 Score                      :     1.4141%   Baseline:     0.1182%

 ### Test Data - Concept overlap metrics @ k=10 ###
Accuracy                      :    77.2621%   Baseline:    26.6456%
Mean Average Precision        :    34.5566%   Baseline:     3.7382%
Mean Average Recall           :     0.7541%   Baseline:       0.06%
F1 Score                      :     1.4761%   Baseline:     0.1182%


In [27]:
# Compare the models on the concept-overlap criterion for k = 1..k
_ = retrieval_graph_compare(
    [entry["results"] for entry in results],
    dataset_reference,
    model_labels,
    tot_relevant_con,
    k_range=(1, k),
    metrics=metrics,
    titlexyf=("k", None, "Test Data - Concept Overlap model comparison"),
    relevance=concept_relevance,
    reference_preprocess=reference_preprocess_con,
)