In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch

In [3]:
# Experiment keys (used to index the per-experiment metric dicts) mapped to
# their longer labels.
exps = dict(
    base="Baseline",
    elr="ELR",
    sel="Selective Loss",
)

In [12]:
# Train Epoch: (\d+) \[ ?\d+/\d+ \((\d+)%\)\]
# image_to_text_mean_rank: ([\d.]+)\simage_to_text_median_rank: ([\d.]+)\simage_to_text_R@1: ([\d.]+)\simage_to_text_R@5: ([\d.]+)\simage_to_text_R@10: ([\d.]+)\stext_to_image_mean_rank: ([\d.]+)\stext_to_image_median_rank: ([\d.]+)\stext_to_image_R@1: ([\d.]+)\stext_to_image_R@5: ([\d.]+)\stext_to_image_R@10: ([\d.]+)\sclip_val_loss: ([\d.]+).*val_generative_loss: ([\d.]+)


In [14]:
# Accumulators filled while parsing the logs: every metric maps an experiment
# key from `exps` to its own, initially empty, list of values.
rank_keys = ["mean", "median", "1", "5", "10"]

# Training
contrastive_loss, caption_loss = ({name: [] for name in exps} for _ in range(2))
# Eval
clip_loss, generative_loss = ({name: [] for name in exps} for _ in range(2))
img_text_metrics = {"img_text_" + key: {name: [] for name in exps} for key in rank_keys}
text_img_metrics = {"text_img_" + key: {name: [] for name in exps} for key in rank_keys}
imagenet_metrics = {"top%d" % k: {name: [] for name in exps} for k in (1, 5)}

In [15]:
def _find_floats(pattern, text):
    """Return every capture group of ``pattern`` in ``text`` as a tuple of floats.

    Returns None when the pattern does not match or a captured value is not a
    valid number.
    """
    match = re.search(pattern, text)
    if match is None:
        return None
    try:
        # The numeric character class also accepts a sentence-ending period,
        # so trim it before converting.
        return tuple(float(group.rstrip(".")) for group in match.groups())
    except ValueError:
        return None


def read_out_log(file_path, exp):
    """Parse one training log and append its metrics to the module-level series.

    Lines containing both "Train" and "(100%)" contribute the contrastive and
    caption losses. Any other line containing "@" is treated as an evaluation
    line and contributes the retrieval ranks / recalls, ``clip_val_loss`` and
    ``val_generative_loss``, plus the top-1 / top-5 values when "imagenet"
    appears in the line.

    All values are stored as floats (the logged numbers contain decimals, so
    ``int`` conversion would fail). A line whose expected values cannot all be
    parsed is printed and skipped as a whole, so the per-experiment lists stay
    aligned with each other.

    Args:
        file_path: Path of the log file to read.
        exp: Experiment key under which the values are appended; it must
            already be a key of the metric dictionaries.

    Returns:
        None. Results are appended to ``contrastive_loss``, ``caption_loss``,
        ``clip_loss``, ``generative_loss``, ``img_text_metrics``,
        ``text_img_metrics`` and ``imagenet_metrics``.
    """
    number = r": ([\d.]+)"
    with open(file_path, "r") as f:
        for line in f:
            if "(100%)" in line and "Train" in line:
                losses = _find_floats(
                    r"Contrastive_loss: ([\d.]+).*Caption_loss: ([\d.]+)", line
                )
                if losses is None:
                    print("Skipping train line without parseable losses:")
                    print(line)
                    continue
                contrastive_loss[exp].append(losses[0])
                caption_loss[exp].append(losses[1])
            elif "@" in line:
                # Collect (destination list, pattern) pairs first so nothing is
                # appended unless the whole line parses.
                wanted = []
                for rk in rank_keys:
                    # Numeric keys are recall@k fields; the others are rank statistics.
                    field = "R@" + rk if rk.isnumeric() else rk + "_rank"
                    wanted.append((img_text_metrics[f"img_text_{rk}"][exp],
                                   "image_to_text_" + field + number))
                    wanted.append((text_img_metrics[f"text_img_{rk}"][exp],
                                   "text_to_image_" + field + number))
                wanted.append((clip_loss[exp], "clip_val_loss" + number))
                wanted.append((generative_loss[exp], "val_generative_loss" + number))
                if "imagenet" in line:
                    for rk in [1, 5]:
                        wanted.append((imagenet_metrics[f"top{rk}"][exp],
                                       f"-val-top{rk}" + number))

                parsed = [_find_floats(pattern, line) for _, pattern in wanted]
                if any(values is None for values in parsed):
                    print("Skipping eval line with missing or unparseable metrics:")
                    print(line)
                    continue
                for (series, _), values in zip(wanted, parsed):
                    series.append(values[0])


In [None]:
def load_chkpt(file_path, exp, model=None, optimizer=None):
    """Load a training checkpoint and optionally restore model/optimizer state.

    Args:
        file_path: Path to a checkpoint file written with ``torch.save``.
        exp: Experiment key the checkpoint belongs to (currently unused).
        model: Optional module. When given, ``checkpoint['model_state_dict']``
            is loaded into it (it must have the same architecture).
        optimizer: Optional optimizer. When given,
            ``checkpoint['optimizer_state_dict']`` is loaded into it (it must be
            set up with the same parameters).

    Returns:
        The loaded checkpoint mapping.

    Raises:
        KeyError: If ``'epoch'`` is missing, or if a model/optimizer is passed
            and the corresponding state dict is not in the checkpoint.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source. map_location="cpu" lets checkpoints saved on a GPU
    # be opened on a CPU-only machine; load_state_dict copies the values onto
    # the device of the target model/optimizer.
    # NOTE(review): the key names below assume the generic PyTorch checkpoint
    # layout -- confirm against the training script that wrote the file.
    checkpoint = torch.load(file_path, map_location="cpu")
    epoch = checkpoint["epoch"]

    # Load the state dict into the model, if one was supplied.
    if model is not None:
        model.load_state_dict(checkpoint['model_state_dict'])

    # Load the state dict into the optimizer, if one was supplied.
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # The loss is only present if it was saved with the checkpoint.
    loss = checkpoint.get('loss')

    print(f"Loaded checkpoint for epoch {epoch} with loss {loss}.")
    return checkpoint


Certainly! To explain these concepts effectively to a graduate-level class, it's crucial to start with foundational ideas and then build towards more complex and technical descriptions. Here’s how you can structure your explanation for **contrastive loss**, **caption loss**, **generative loss**, and **CLIP loss**:

### 1. **Contrastive Loss**

#### Basic Concept:
Contrastive loss is used in scenarios where the goal is to learn by comparing—specifically, to distinguish between similar and dissimilar items. In the context of machine learning, particularly in tasks involving embeddings (representations), the contrastive loss function helps to ensure that similar items are mapped closer together and dissimilar items are mapped farther apart in the embedding space.

#### Technical Details:
A common use of contrastive loss is in siamese networks or triplet networks, often used for tasks such as face verification or any form of learning from pairs. In these setups:
- **Siamese networks** involve pairs of inputs. The loss calculates the distance between these pairs, pushing the distances of "positive" pairs (similar items) to be small, and "negative" pairs (dissimilar items) to be large.
- **Triplet networks** extend this by using three inputs: an anchor, a positive (similar to the anchor), and a negative (dissimilar to the anchor). The loss ensures the distance between the anchor and the negative is greater than the distance between the anchor and the positive by some margin.

The mathematical formulation often looks like this for a pair (x1, x2) with a binary label y indicating if they are similar (1) or not (0):
$$ L = y \cdot D(x1, x2)^2 + (1 - y) \cdot \max(0, m - D(x1, x2))^2 $$
where $ D $ is a distance function (like Euclidean distance), and $ m $ is a margin enforced between dissimilar pairs.

### 2. **Caption Loss**

#### Basic Concept:
Caption loss is typically found in image captioning tasks, where the model generates textual descriptions for images. The caption loss measures how well the generated text matches the expected text, helping to guide the training of models in generating accurate and relevant descriptions.

#### Technical Details:
Caption loss is commonly implemented using cross-entropy loss, which quantitatively measures the difference between the predicted probability distribution (generated caption) and the actual distribution (true caption). Cross-entropy loss is favored because it effectively handles the probabilities of a sequence of words:
$$ L = -\sum_{t=1}^T \log(p_{target_t}) $$
Here, $ T $ is the length of the caption, and $ p_{target_t} $ is the probability assigned by the model to the target word at position $ t $.

### 3. **Generative Loss**

#### Basic Concept:
Generative loss applies to generative models, which are designed to generate new data instances that resemble the training data. This could be anything from images, text, or even new music. The generative loss measures how well the model performs this task, often focusing on how realistically the model replicates the data distribution.

#### Technical Details:
In generative adversarial networks (GANs), for example, the generative loss often involves a component where the generator tries to minimize the ability of a discriminator to distinguish generated data from real data, effectively minimizing:
$$ \log(1 - D(G(z))) $$
where $ D $ is the discriminator, $ G $ is the generator, and $ z $ is a noise vector. The generator's loss is typically balanced against the discriminator's loss in a zero-sum game framework.

### 4. **CLIP Loss (Contrastive Language–Image Pre-training)**

#### Basic Concept:
CLIP loss comes from the CLIP model by OpenAI, which learns visual concepts from natural language descriptions. It uses a contrastive loss formulation to align the text and image in a shared multidimensional space, promoting similarity between corresponding text and image pairs versus non-corresponding pairs.

#### Technical Details:
CLIP models are trained on a variety of images and text pairs. The loss function aims to maximize the cosine similarity between the correct pairs of images and texts compared to incorrect ones, using a temperature-scaled cross-entropy loss:
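$$ L = -\log \frac{\exp(\text{sim}(i, t) / \tau)}{\sum_{j=1}^N \exp(\text{sim}(i, t_j) / \tau)} $$
Here, $ \text{sim}(i, t) $ represents the similarity score (e.g., cosine similarity) between the embeddings of the image $ i $ and its paired text $ t $, the sum in the denominator runs over the $ N $ texts $ t_j $ in the batch, and $ \tau $ is a temperature parameter that scales the logits. The same loss is also computed in the text-to-image direction, and the two terms are averaged.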

### Conclusion for Presentation
Start with the foundational concepts and the purpose of each loss type in specific tasks (contrastive for learning discriminative features, captioning for matching descriptions, generative for creating new instances, and CLIP for aligning cross-modal data). Then delve into the mathematical expressions to show how these objectives are quantitatively implemented and optimized. This structure helps build understanding from the ground up, connecting practical objectives with theoretical formulations.