<a href="https://colab.research.google.com/github/sahandtebyani/Multi-Modal/blob/main/Multi_Modal_Contrastive_Loss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from transformers import BertModel, BertTokenizer
from PIL import Image
from torch.utils.data import DataLoader, Dataset

# Step 1: Dataset with image-text pairs
class ImageTextDataset(Dataset):
    """Dataset of (image, caption) pairs prepared for the two encoders.

    Args:
        image_paths: Sequence of image file paths, aligned index-by-index
            with ``captions``.
        captions: Sequence of caption strings.
        transform: Callable applied to the RGB ``PIL.Image``; whatever it
            returns becomes the image element of a sample.
        tokenizer: Callable invoked as ``tokenizer(caption, padding=...,
            max_length=..., truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Fixed token length every caption is padded/truncated to.

    Raises:
        ValueError: If ``image_paths`` and ``captions`` differ in length.
    """

    def __init__(self, image_paths, captions, transform, tokenizer, max_length=32):
        # Fail fast: a mismatch would otherwise surface later as an
        # IndexError (too few images) or silently ignore extra images.
        if len(image_paths) != len(captions):
            raise ValueError(
                f"image_paths and captions must have the same length, "
                f"got {len(image_paths)} and {len(captions)}"
            )
        self.image_paths = image_paths
        self.captions = captions
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of image/caption pairs."""
        return len(self.captions)

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask)`` for sample ``idx``."""
        # Open inside a context manager so the file handle is released
        # deterministically; convert() yields an independent in-memory image.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        # Tokenize caption to a fixed length so samples can be batched.
        caption = self.captions[idx]
        tokens = self.tokenizer(
            caption,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        # Drop the leading dimension of the 'pt' tensors before returning.
        return image, tokens['input_ids'].squeeze(0), tokens['attention_mask'].squeeze(0)

# Placeholder samples -- swap in a real dataset before training
image_paths = [
    "path_to_image1.jpg",
    "path_to_image2.jpg",
]
captions = [
    "A man riding a bike.",
    "A dog playing with a ball.",
]

# Image pipeline: fixed 224x224 resize, tensor conversion, per-channel normalization
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Tokenizer used for the caption side
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the pairs in a Dataset and serve them in shuffled batches of two
dataset = ImageTextDataset(
    image_paths=image_paths,
    captions=captions,
    transform=transform,
    tokenizer=tokenizer,
)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

# Step 2: Models
# Image encoder (ResNet-18 with pretrained weights).
# `pretrained=True` is deprecated in recent torchvision releases in favour of
# the `weights=` enum API; fall back to the legacy flag where the enum is absent.
if hasattr(models, "ResNet18_Weights"):
    image_encoder = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    image_encoder = models.resnet18(pretrained=True)
image_encoder.fc = nn.Identity()  # Remove classification head

# Text encoder (BERT)
text_encoder = BertModel.from_pretrained('bert-base-uncased')

# Cross-Attention Layer
class CrossAttention(nn.Module):
    """Single-head scaled dot-product cross-attention (text attends to image).

    Args:
        feature_dim: Width of the last dimension of both inputs and of the
            returned features.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.query_layer = nn.Linear(feature_dim, feature_dim)
        self.key_layer = nn.Linear(feature_dim, feature_dim)
        self.value_layer = nn.Linear(feature_dim, feature_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, text_features, image_features):
        """Return image values aggregated per text position.

        Args:
            text_features: Tensor whose last dimension is ``feature_dim``;
                queries are derived from it.
            image_features: Tensor whose last dimension is ``feature_dim``;
                keys and values are derived from it.

        Returns:
            Tensor with the leading dimensions of ``text_features`` and a
            last dimension of ``feature_dim``.
        """
        # Cross-attention: text attends to image
        queries = self.query_layer(text_features)
        keys = self.key_layer(image_features)
        values = self.value_layer(image_features)

        # Scale by sqrt(d) so score magnitude does not grow with the feature
        # width and push the softmax into saturation.
        scale = queries.size(-1) ** 0.5
        attention_scores = torch.matmul(queries, keys.transpose(-1, -2)) / scale
        attention_weights = self.softmax(attention_scores)
        attended_features = torch.matmul(attention_weights, values)
        return attended_features

# Combined model
class MultiModalModel(nn.Module):
    """Image encoder + text encoder fused by text-to-image cross-attention.

    The two encoders need not emit features of the same width (a BERT-base
    text encoder produces 768-wide states, for example). Each modality is
    therefore projected into a shared ``feature_dim`` space before attention;
    when a width already equals ``feature_dim`` the projection is an identity.

    Args:
        image_encoder: Module called as ``image_encoder(images)``; its output
            is treated as one feature vector per image.
        text_encoder: Module called as ``text_encoder(input_ids,
            attention_mask=...)`` whose result exposes ``last_hidden_state``.
        feature_dim: Width of the shared space used by the cross-attention.
        text_dim: Width of the text encoder's hidden states. Defaults to
            ``text_encoder.config.hidden_size`` when available, otherwise
            ``feature_dim``.
        image_dim: Width of the image encoder's output. Defaults to
            ``feature_dim``.
    """

    def __init__(self, image_encoder, text_encoder, feature_dim, text_dim=None, image_dim=None):
        super().__init__()
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder

        if text_dim is None:
            encoder_config = getattr(text_encoder, "config", None)
            text_dim = getattr(encoder_config, "hidden_size", feature_dim)
        if image_dim is None:
            image_dim = feature_dim

        self.text_projection = self._make_projection(text_dim, feature_dim)
        self.image_projection = self._make_projection(image_dim, feature_dim)
        self.cross_attention = CrossAttention(feature_dim)

    @staticmethod
    def _make_projection(in_dim, out_dim):
        """Return a linear map from ``in_dim`` to ``out_dim`` (identity if equal)."""
        if in_dim == out_dim:
            return nn.Identity()
        return nn.Linear(in_dim, out_dim)

    def forward(self, images, input_ids, attention_masks):
        """Return text-position features attended over the image features.

        Args:
            images: Batch passed unchanged to the image encoder.
            input_ids: Token ids passed to the text encoder.
            attention_masks: Attention mask passed to the text encoder.

        Returns:
            The cross-attention output; with the encoders used in this file
            this is presumably (batch_size, seq_len, feature_dim).
        """
        # Extract image features and bring them into the shared space
        image_features = self.image_projection(self.image_encoder(images))

        # Extract per-token text features and bring them into the shared space
        text_output = self.text_encoder(input_ids, attention_mask=attention_masks)
        text_features = self.text_projection(text_output.last_hidden_state)

        # Apply cross-attention; unsqueeze(1) turns the per-image vector into
        # a length-1 key/value sequence.
        combined_features = self.cross_attention(text_features, image_features.unsqueeze(1))
        return combined_features

# Assemble the combined model around the two encoders with a 512-wide attention space
model = MultiModalModel(
    image_encoder=image_encoder,
    text_encoder=text_encoder,
    feature_dim=512,
)

# Step 3: Define loss and optimizer
class ContrastiveLoss(nn.Module):
    """Text-to-image contrastive loss over a batch of paired embeddings.

    Row ``i`` of ``text_embeddings`` is treated as the positive match of row
    ``i`` of ``image_embeddings``; every other image in the batch acts as a
    negative for that text.

    Args:
        temperature: Positive divisor applied to the cosine similarities
            before the softmax.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        self.temperature = temperature

    def forward(self, text_embeddings, image_embeddings):
        """Return the mean negative log-likelihood of the matching pairs.

        Args:
            text_embeddings: 2-D tensor, one row per sample.
            image_embeddings: 2-D tensor with the same number of rows and the
                same feature width as ``text_embeddings``.

        Returns:
            Scalar tensor holding the batch-averaged loss.
        """
        # Normalize embeddings so the dot product is a cosine similarity
        text_embeddings = nn.functional.normalize(text_embeddings, p=2, dim=-1)
        image_embeddings = nn.functional.normalize(image_embeddings, p=2, dim=-1)

        # Temperature-scaled similarity of every text against every image
        logits = torch.matmul(text_embeddings, image_embeddings.T) / self.temperature

        # The positive for row i sits on the diagonal. cross_entropy computes
        # -log(exp(l_ii) / sum_j exp(l_ij)) via log-sum-exp, which stays finite
        # where an explicit exp() would overflow at small temperatures.
        targets = torch.arange(logits.size(0), device=logits.device)
        return nn.functional.cross_entropy(logits, targets)

# Adam over every parameter of the combined model, plus the loss with its default temperature
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
contrastive_loss = ContrastiveLoss()

# Step 4: Training loop
# from_pretrained() returns the text encoder in evaluation mode; switch the
# whole model to training mode so dropout is active while fitting.
model.train()

for epoch in range(5):  # Example for 5 epochs
    for images, input_ids, attention_masks in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        combined_features = model(images, input_ids, attention_masks)

        # Compute loss.
        # NOTE: placeholder objective -- the first-position feature is
        # contrasted against itself, so both arguments are the same tensor.
        # Replace with separate text and image embeddings for real training.
        loss = contrastive_loss(combined_features[:, 0, :], combined_features[:, 0, :])

        # Backpropagation
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")