In [1]:
import numpy as np
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/semeval/SemEval2025/test.tsv
/kaggle/input/semeval/SemEval2025/clip1.py
/kaggle/input/semeval/SemEval2025/train.tsv
/kaggle/input/semeval/SemEval2025/CS779-Project5-AdMIRe-MidTerm-Presentation-5.pdf
/kaggle/input/semeval/SemEval2025/2022.csv
/kaggle/input/semeval/SemEval2025/Subtask B/subtask_b_sample.tsv
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/15529605996.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/17543802828.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/s2_18056618867.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/14839290985.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/19106979831.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/s1_18703651931.png
/kaggle/input/semeval/SemEval2025/Subtask B/new blood/13567540079.png
/kaggle/input/semeval/SemEval2025/Subtask B/panda car/12588591958.png
/kaggle/input/semeval/SemEval2025/Subtask B/panda car/s2_16640394825.png
/kaggle/input/semeval/SemEval2025/Subtas

In [8]:
import pandas as pd
import torch
from torch import nn, optim
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


class ContrastiveDataset(Dataset):
    """Sentence-pair dataset used for contrastive fine-tuning.

    Every row of ``dataframe`` must provide ``sentence_1``, ``sentence_2`` and
    ``sim``.  The ``sim`` value is binarised: only a value equal to 1 is kept
    as a positive label, everything else becomes 0.

    Args:
        dataframe: pandas DataFrame holding the sentence pairs.
        tokenizer: callable tokenizer invoked with ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors`` keywords.
        max_len: length every sentence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, sentence):
        """Tokenize one sentence and return ``(input_ids, attention_mask)``."""
        encoded = self.tokenizer(
            sentence,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # The tokenizer returns a leading batch axis of size one; drop it so
        # the DataLoader can stack items itself.
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return ``(ids_1, mask_1, ids_2, mask_2, label)`` for row ``idx``."""
        record = self.data.iloc[idx]
        first, second, raw_sim = record['sentence_1'], record['sentence_2'], record['sim']
        label = 1 if raw_sim == 1 else 0

        ids_first, mask_first = self._encode(first)
        ids_second, mask_second = self._encode(second)

        return (
            ids_first,
            mask_first,
            ids_second,
            mask_second,
            torch.tensor(label, dtype=torch.float32)
        )


class ContrastiveLoss(nn.Module):
    """Exponential contrastive loss over cosine similarity.

    For a pair with cosine similarity ``s`` and label ``y`` the per-pair loss
    is ``y * exp(-s) + (1 - y) * exp(s - margin)``; the batch mean is returned.

    Args:
        margin: constant subtracted from the similarity in the negative term.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin
        self.cosine_similarity = nn.CosineSimilarity(dim=1)

    def forward(self, output1, output2, label):
        """Return the mean loss for a batch of embedding pairs and labels."""
        cos = self.cosine_similarity(output1, output2)
        # Positive pairs are penalised for low similarity ...
        attract = label * torch.exp(-cos)
        # ... negative pairs for high similarity.
        repel = (1 - label) * torch.exp(cos - self.margin)
        return (attract + repel).mean()


sep_token = "[SEP]"

def insert_sep_inside_sentence(sentence, compound, ignore_case=False):
    """Wrap every occurrence of ``compound`` in ``sentence`` with ``sep_token``.

    Args:
        sentence: text to search. Returned unchanged when it is not a string
            (for example a NaN read from a CSV).
        compound: expression to mark. Non-string or empty values leave the
            sentence unchanged; without this guard an empty string would match
            between every character and a NaN would raise ``TypeError``.
        ignore_case: when True the match is case-insensitive and the casing
            found in ``sentence`` is kept. Defaults to the original
            case-sensitive behaviour.

    Returns:
        The sentence with ``"[SEP] <compound> [SEP]"`` substituted for each
        match, or the input sentence when nothing matches.
    """
    if not isinstance(sentence, str) or not isinstance(compound, str) or not compound:
        return sentence

    if not ignore_case:
        # str.replace is already a no-op when the compound is absent.
        return sentence.replace(compound, f"{sep_token} {compound} {sep_token}")

    import re  # local import: only needed for the optional case-insensitive path

    return re.sub(
        re.escape(compound),
        lambda match: f"{sep_token} {match.group(0)} {sep_token}",
        sentence,
        flags=re.IGNORECASE,
    )


def train_model(model, dataloader, optimizer, loss_fn, device, epochs=5, save_path="distilbert_model.bin"):
    """Train ``model`` on sentence pairs and checkpoint it after every epoch.

    Args:
        model: encoder whose output exposes ``last_hidden_state``.
        dataloader: yields ``(ids_1, mask_1, ids_2, mask_2, labels)`` batches.
        optimizer: optimizer already bound to the model parameters.
        loss_fn: callable taking the two embedding batches and the labels.
        device: device the model and every batch tensor are moved to.
        epochs: number of passes over ``dataloader``.
        save_path: final checkpoint path; per-epoch checkpoints are written to
            ``f"{save_path}_epoch_{n}.bin"``.
    """
    model.train()
    model.to(device)

    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        progress = tqdm(dataloader, desc=f"Epoch {epoch_no}/{epochs}")
        for batch in progress:
            ids_a, mask_a, ids_b, mask_b, targets = (tensor.to(device) for tensor in batch)

            # The hidden state at position 0 is used as the sentence embedding.
            emb_a = model(ids_a, attention_mask=mask_a).last_hidden_state[:, 0, :]
            emb_b = model(ids_b, attention_mask=mask_b).last_hidden_state[:, 0, :]
            batch_loss = loss_fn(emb_a, emb_b, targets)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch_no}/{epochs}, Loss: {mean_loss:.4f}")
        torch.save(model.state_dict(), f"{save_path}_epoch_{epoch_no}.bin")

    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")


def main():
    """Fine-tune DistilBERT on the sentence-pair CSV with the contrastive loss.

    Reads the CSV, marks the ``MWE1`` expression inside ``sentence_1`` with
    separator tokens, builds the dataset / loader and runs ``train_model`` for
    ten epochs, saving to ``distilbert_model_contrastive.bin``.
    """
    def _mark_mwe(row):
        # Surround the multi-word expression of this row with separator tokens.
        return insert_sep_inside_sentence(row['sentence_1'], row['MWE1'])

    df = pd.read_csv('/kaggle/input/semeval/SemEval2025/2022.csv')
    df['sentence_1'] = df.apply(_mark_mwe, axis=1)

    model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    loader = DataLoader(ContrastiveDataset(df, tokenizer), batch_size=16, shuffle=True)

    train_model(
        model,
        loader,
        optim.AdamW(model.parameters(), lr=5e-5),
        ContrastiveLoss(margin=1.0),
        torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        epochs=10,
        save_path="distilbert_model_contrastive.bin",
    )


if __name__ == "__main__":
    main()


Epoch 1/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 1/10, Loss: 0.5155


Epoch 2/10: 100%|██████████| 411/411 [01:28<00:00,  4.66it/s]


Epoch 2/10, Loss: 0.3909


Epoch 3/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 3/10, Loss: 0.3518


Epoch 4/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 4/10, Loss: 0.3321


Epoch 5/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 5/10, Loss: 0.3246


Epoch 6/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 6/10, Loss: 0.3158


Epoch 7/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 7/10, Loss: 0.3127


Epoch 8/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 8/10, Loss: 0.3142


Epoch 9/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 9/10, Loss: 0.3053


Epoch 10/10: 100%|██████████| 411/411 [01:28<00:00,  4.67it/s]


Epoch 10/10, Loss: 0.2998
Model saved to distilbert_model_contrastive.bin


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel, DistilBertTokenizer, DistilBertModel
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

# Choose the device first so each model can be moved right after loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image side: CLIP ViT-B/32 and its preprocessor.
# Text side: DistilBERT and its tokenizer.
image_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
text_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Optional warm start (disabled): uncomment to initialise the text encoder
# from the checkpoint written by the contrastive training cell.
# save_path = "distilbert_model_contrastive.bin"
# text_model.load_state_dict(torch.load(save_path, map_location="cpu"))
# print(f"Loaded trained text model from {save_path}")

# Linear heads mapping the 768-d encoder outputs of each modality to 512-d.
image_embedding_reduction = nn.Linear(in_features=768, out_features=512).to(device)
text_embedding_reduction = nn.Linear(in_features=768, out_features=512).to(device)

class ImageTextDataset(Dataset):
    """Pairs each sentence with a fixed-size group of candidate images.

    ``image_paths`` is a flat list laid out group after group: the images for
    sentence ``i`` are ``image_paths[k * i : k * i + k]`` with
    ``k = images_per_sentence``.

    Args:
        sentences: sequence of sentences, one per item.
        image_paths: flat sequence of image file paths (see layout above).
        processor: image processor called with ``images=``,
            ``return_tensors="pt"`` and ``padding=True``.
        tokenizer: text tokenizer called with ``max_length``, ``truncation``,
            ``padding`` and ``return_tensors`` keywords.
        max_len: length the sentence is padded / truncated to.
        images_per_sentence: size of each image group (previously the
            hard-coded constant 5).

    Raises:
        ValueError: if ``images_per_sentence`` is smaller than 1.
    """

    def __init__(self, sentences, image_paths, processor, tokenizer, max_len=77, images_per_sentence=5):
        if images_per_sentence < 1:
            raise ValueError("images_per_sentence must be at least 1")
        self.sentences = sentences
        self.image_paths = image_paths
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.images_per_sentence = images_per_sentence

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(encoded_text, image_inputs)`` for sentence ``idx``.

        Raises:
            ValueError: if fewer than ``images_per_sentence`` paths are
                available for this sentence, which means ``image_paths`` is
                not aligned with ``sentences``.
        """
        text = self.sentences[idx]
        start = self.images_per_sentence * idx
        image_paths = self.image_paths[start:start + self.images_per_sentence]
        if len(image_paths) != self.images_per_sentence:
            raise ValueError(
                f"Sentence {idx} has {len(image_paths)} image path(s); "
                f"expected {self.images_per_sentence}"
            )

        images = []
        for path in image_paths:
            # Close the file handle as soon as the pixels have been copied.
            with Image.open(path) as img:
                images.append(img.convert("RGB"))

        encoded_text = self.tokenizer(text, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt")

        image_inputs = self.processor(images=images, return_tensors="pt", padding=True)

        return encoded_text, image_inputs

class PlackettLuceLoss(nn.Module):
    """Negative log-likelihood of a ranking under the Plackett-Luce model."""

    def __init__(self):
        super(PlackettLuceLoss, self).__init__()

    def forward(self, scores, true_order):
        """
        Computes the Plackett-Luce loss.

        Args:
        - scores: Tensor of shape (batch_size, num_items), predicted scores for each item.
        - true_order: Tensor of shape (batch_size, num_items); row ``b`` lists the item
          indices of ``scores[b]`` from best to worst.

        Returns:
        - loss: Scalar loss value (mean over the batch).
        """
        order = true_order.to(device=scores.device, dtype=torch.long)

        # Re-arrange the scores so that column r holds the score of the item
        # ranked r-th in the ground truth.
        ordered = torch.gather(scores, 1, order)

        # log sum_{j >= r} exp(ordered[j]) for every rank r, computed as a
        # reversed cumulative log-sum-exp. Working in log space avoids the
        # inf / inf = NaN that exp(score) / sum(exp(...)) produces for large
        # scores.
        log_denominators = torch.logcumsumexp(ordered.flip(-1), dim=-1).flip(-1)

        log_likelihood = (ordered - log_denominators).sum(dim=-1)
        return -log_likelihood.mean()

# ---------------------------------------------------------------------------
# Training data
# ---------------------------------------------------------------------------
# ImageTextDataset reads the flat path list in groups of five, so every kept
# sentence must contribute exactly this many images.
IMAGES_PER_SENTENCE = 5

df = pd.read_csv('/kaggle/input/semeval/SemEval2025/train.tsv', sep="\t")

# NOTE(review): assumes train.tsv carries the same `expected_order` column
# (a stringified list of image file names, best match first) that the
# evaluation cell reads from test.tsv -- confirm against the data.
if 'expected_order' not in df.columns:
    raise KeyError(
        "train.tsv has no 'expected_order' column; a ground-truth image ranking "
        "is required to train with the Plackett-Luce loss"
    )

ipath = []
sentences = []

for _, row in df.iterrows():
    ch = "".join(char for char in row['compound'] if char != " ")
    path = f'/kaggle/input/semeval/SemEval2025/train/{ch}'

    # Same parsing as the evaluation cell: "['a.png', 'b.png']" -> file names.
    ranked_names = [name.strip() for name in row['expected_order'].strip("[]").replace("'", "").split(",")]
    ranked_paths = [os.path.join(path, name) for name in ranked_names if name]

    # Keep sentences and image groups in lock-step: a row is used only when
    # its complete, ranked image group is present on disk.
    if len(ranked_paths) != IMAGES_PER_SENTENCE or not all(os.path.isfile(p) for p in ranked_paths):
        print(f"Skipping '{row['compound']}': expected {IMAGES_PER_SENTENCE} ranked images under {path}")
        continue

    ipath.extend(ranked_paths)
    sentences.append(row['sentence'])

dataset = ImageTextDataset(sentences, ipath, processor, tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# ---------------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------------
pl_loss = PlackettLuceLoss()

# The two projection heads are part of the scoring function, so they must be
# optimised together with the encoders (previously they stayed at their
# random initialisation).
trainable_params = (
    list(image_model.parameters())
    + list(text_model.parameters())
    + list(image_embedding_reduction.parameters())
    + list(text_embedding_reduction.parameters())
)
optimizer = torch.optim.AdamW(trainable_params, lr=2e-6)

epochs = 20
for epoch in range(epochs):
    image_model.train()
    text_model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        text_inputs, image_inputs = batch

        # batch_size is 1, so drop the axis added by the DataLoader.
        input_ids = text_inputs["input_ids"].squeeze(0).to(device)
        attention_mask = text_inputs["attention_mask"].squeeze(0).to(device)
        pixel_values = image_inputs["pixel_values"].squeeze(0).to(device)

        # Text embedding: first-token hidden state -> normalise -> project -> normalise.
        text_outputs = text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_embeddings = text_outputs.last_hidden_state[:, 0, :]
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
        text_embeddings = text_embedding_reduction(text_embeddings)
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

        # Image embeddings: first-token hidden state of the CLIP vision tower
        # -> project -> normalise.
        image_outputs = image_model.vision_model(pixel_values=pixel_values)
        image_embeddings = image_outputs[0][:, 0, :]
        image_embeddings = image_embedding_reduction(image_embeddings)
        image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

        # One similarity score per candidate image.
        similarities = torch.matmul(text_embeddings, image_embeddings.T)
        similarities = similarities.squeeze(0)

        # The images were loaded in ground-truth order, so the target ranking
        # is the identity permutation. (Deriving it from argsort of the
        # model's own scores, as before, gives the loss no supervision.)
        true_order = torch.arange(similarities.size(0), device=similarities.device)

        loss = pl_loss(similarities.unsqueeze(0), true_order.unsqueeze(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # max(..., 1) keeps the report well-defined when every row was skipped.
    avg_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

    # Checkpoint both encoders and both projection heads after each epoch.
    torch.save(image_model.state_dict(), f"clip_image_model_epoch_{epoch+1}.bin")
    torch.save(text_model.state_dict(), f"distilbert_text_model_epoch_{epoch+1}.bin")
    torch.save(image_embedding_reduction.state_dict(), f"image_projection_epoch_{epoch+1}.bin")
    torch.save(text_embedding_reduction.state_dict(), f"text_projection_epoch_{epoch+1}.bin")

In [7]:
import torch
import os
from PIL import Image
from tqdm import tqdm
import pandas as pd
from transformers import DistilBertTokenizer
from transformers import CLIPProcessor, CLIPModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel, DistilBertTokenizer, DistilBertModel

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: these are the stock pretrained checkpoints; nothing in this cell loads
# weights written by a training loop.
image_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
text_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

df2 = pd.read_csv('/kaggle/input/semeval/SemEval2025/test.tsv', sep="\t")

# Flat list of image paths, five per kept sentence, in ground-truth order
# (best match first), and the matching sentences. The evaluation function
# addresses images as ipath2[5 * i : 5 * i + 5], so both lists must stay
# aligned.
ipath2 = []
sentences2 = []

for i, row in df2.iterrows():
    compound = row['compound']
    ch = "".join(char for char in compound if char != " ")
    base_path = f'/kaggle/input/semeval/SemEval2025/train/{ch}'

    # "['a.png', 'b.png', ...]" -> ['a.png', 'b.png', ...]
    ranked_names = [name.strip() for name in row['expected_order'].strip("[]").replace("'", "").split(",")]
    row_paths = [os.path.join(base_path, name) for name in ranked_names if name]

    # Previously the sentence was always appended while missing images were
    # dropped, which shifted every later sentence onto the wrong image group.
    # A row is now used only when its complete group of five exists.
    if len(row_paths) != 5 or not all(os.path.isfile(p) for p in row_paths):
        print(f"Skipping '{compound}': expected 5 ranked images under {base_path}")
        continue

    ipath2.extend(row_paths)
    sentences2.append(row['sentence'])

def calculate_top1_and_top2_accuracy(text_projection_layer=None, images_per_sentence=5):
    """Rank each sentence's candidate images and report top-1 / top-2 accuracy.

    Uses the module-level ``sentences2`` and ``ipath2`` lists together with
    ``text_model``, ``image_model``, ``tokenizer``, ``processor`` and
    ``device``. The images of sentence ``i`` are
    ``ipath2[k * i : k * i + k]`` with ``k = images_per_sentence``; the first
    image of every group is treated as the correct answer.

    Args:
        text_projection_layer: module mapping the 768-d text embedding to the
            512-d CLIP image-feature space. When omitted a freshly
            initialised ``nn.Linear(768, 512)`` is used, as before; such an
            untrained layer makes the ranking essentially random, so pass a
            trained layer for meaningful numbers.
        images_per_sentence: number of candidate images per sentence.

    Returns:
        ``(top1_accuracy, top2_accuracy)`` as percentages; ``(0.0, 0.0)`` when
        no sentence could be evaluated.
    """
    image_model.eval()
    text_model.eval()

    if text_projection_layer is None:
        text_projection_layer = torch.nn.Linear(768, 512).to(device)
    text_projection_layer.eval()

    correct_top1 = 0
    correct_top2 = 0
    total = 0

    with torch.no_grad():
        for i, sentence in enumerate(tqdm(sentences2, desc="Evaluating on Test Data")):
            image_start_idx = images_per_sentence * i
            selected_image_paths = ipath2[image_start_idx:image_start_idx + images_per_sentence]
            if not selected_image_paths:
                # No images for this sentence: torch.cat would fail on an
                # empty list, so leave it out of the statistics.
                continue

            text_inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
            text_outputs = text_model(**text_inputs)
            text_embeddings = text_outputs.last_hidden_state[:, 0, :]
            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
            text_embeddings = text_projection_layer(text_embeddings)

            image_embeddings_list = []
            for path in selected_image_paths:
                image = Image.open(path).convert("RGB")
                pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
                image_embeddings = image_model.get_image_features(pixel_values=pixel_values)
                image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
                image_embeddings_list.append(image_embeddings)

            image_embeddings = torch.cat(image_embeddings_list, dim=0)
            similarities = torch.matmul(text_embeddings, image_embeddings.T)
            similarities = similarities.squeeze(0)

            # Candidate indices from most to least similar; index 0 is the
            # ground-truth best image.
            ranked_indices = torch.argsort(similarities, descending=True).tolist()

            if ranked_indices[0] == 0:
                correct_top1 += 1
            if 0 in ranked_indices[:2]:
                correct_top2 += 1

            total += 1

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0

    top1_accuracy = correct_top1 / total * 100
    top2_accuracy = correct_top2 / total * 100
    return top1_accuracy, top2_accuracy

# Run the evaluation once and report both metrics as percentages.
top1_accuracy, top2_accuracy = calculate_top1_and_top2_accuracy()
print(
    f"Top-1 Accuracy on Test Data: {top1_accuracy:.2f}%",
    f"Top-2 Accuracy on Test Data: {top2_accuracy:.2f}%",
    sep="\n",
)


Evaluating on Test Data: 100%|██████████| 15/15 [00:03<00:00,  4.56it/s]

Top-1 Accuracy on Test Data: 13.33%
Top-2 Accuracy on Test Data: 40.00%



