# Model

> Dev notebook

In [1]:
# | default_exp models

In [2]:
# | hide
%reload_ext autoreload
%reload_ext nb_black
%autoreload 2
%env CUDA_VISIBLE_DEVICES=

from nbdev.showdoc import *
import sys

__root = "../"
sys.path.append(__root)


env: CUDA_VISIBLE_DEVICES=


<IPython.core.display.Javascript object>

In [3]:
# | export
from torch_snippets import *
from clip.core import *

<IPython.core.display.Javascript object>

In [4]:
# | export
import timm
from transformers import DistilBertModel, DistilBertConfig

<IPython.core.display.Javascript object>

In [6]:
#| export
class ImageEncoder(nn.Module):
    """
    Encode images to a fixed size vector
    """

    def __init__(self, config):
        super().__init__()
        self.model = timm.create_model(
            config.model_name, config.pretrained, num_classes=0, global_pool="avg"
        )
        for p in self.model.parameters():
            p.requires_grad = config.trainable

    def forward(self, x):
        return self.model(x)


class TextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.pretrained:
            self.model = DistilBertModel.from_pretrained(config.distilbert_text_encoder_model)
        else:
            self.model = DistilBertModel(config=DistilBertConfig())
        for p in self.model.parameters():
            p.requires_grad = config.trainable
        self.target_token_idx = config.target_token_idx

    def forward(self, input_ids, attention_mask):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output.last_hidden_state
        return last_hidden_state[:, self.target_token_idx, :]


class ProjectionHead(nn.Module):
    def __init__(self, embedding_dim, config):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, config.projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(config.projection_dim, config.projection_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.layer_norm = nn.LayerNorm(config.projection_dim)

    def forward(self, x):
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        x = x + projected
        x = self.layer_norm(x)
        return x


class CLIP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.image_encoder = ImageEncoder(config)
        self.text_encoder = TextEncoder(config)
        self.image_projection = ProjectionHead(config.image_embedding, config)
        self.text_projection = ProjectionHead(config.text_embedding, config)
        self.temperature = config.temperature

    def forward(self, image, input_ids, attention_mask):
        # Getting Image and Text Features
        image_features = self.image_encoder(image)
        text_features = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Getting Image and Text Embeddings (with same dimension)
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)
        # Calculating the Loss
        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        targets = F.softmax(
            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
        )
        texts_loss = cross_entropy(logits, targets, reduction="none")
        images_loss = cross_entropy(logits.T, targets.T, reduction="none")
        loss = (images_loss + texts_loss) / 2.0  # shape: (batch_size)
        return {'loss': loss.mean()}
    
    @classmethod
    def from_pretrained(cls, folder, config):
        model = cls(config)
        load_torch_model_weights_to(model, P(folder)/'pytorch_model.bin')
        return model
        

def cross_entropy(preds, targets, reduction="none"):
    log_softmax = nn.LogSoftmax(dim=-1)
    loss = (-targets * log_softmax(preds)).sum(1)
    if reduction == "none":
        return loss
    elif reduction == "mean":
        return loss.mean()


<IPython.core.display.Javascript object>

In [8]:
# | hide
import nbdev
nbdev.nbdev_export()
import subprocess

subprocess.run(["/home/yyr/anaconda3/envs/mcvp-book/bin/black", __root])


Skipping .ipynb files as Jupyter dependencies are not installed.
You can fix this by running ``pip install "black[jupyter]"``
reformatted /mnt/347832F37832B388/projects/MCVP2e/Chapter-15b/CLIP/clip/core.py
reformatted /mnt/347832F37832B388/projects/MCVP2e/Chapter-15b/CLIP/clip/config.py
reformatted /mnt/347832F37832B388/projects/MCVP2e/Chapter-15b/CLIP/clip/_modidx.py
reformatted /mnt/347832F37832B388/projects/MCVP2e/Chapter-15b/CLIP/clip/dataset.py
reformatted /mnt/347832F37832B388/projects/MCVP2e/Chapter-15b/CLIP/clip/models.py

All done! ✨ 🍰 ✨
5 files reformatted, 3 files left unchanged.


CompletedProcess(args=['/home/yyr/anaconda3/envs/mcvp-book/bin/black', '../'], returncode=0)

<IPython.core.display.Javascript object>