In [None]:
import pandas as pd

# Load data
queries = pd.read_csv("data/queries.csv")
docs = pd.read_csv("data/corpus_trec.csv")
ground = pd.read_csv("data/groundtruth.csv")

# Merge ground truth with query text
data = ground.merge(queries, on="QueryID", how="left")
data.head()

In [None]:
data = data.merge(docs, left_on="relevant docs", right_on="DocID", how="left")
data.head()

In [None]:
data.shape

In [None]:
query_doc_pairs = data[["Query", "text"]]
query_doc_pairs.head()

In [None]:
query_doc_pairs.to_csv("data/query_doc_pairs.csv", index=False)

In [None]:
import pandas as pd 

query_doc_pairs = pd.read_csv("data/query_doc_pairs.csv")

In [None]:
from sklearn.model_selection import train_test_split

# Example: split into 500 train and 500 test
train_pairs, test_pairs = train_test_split(
    query_doc_pairs,
    train_size=500,
    test_size=500,
    random_state=42,
    shuffle=True
)

print(f"Train size: {len(train_pairs)}, Test size: {len(test_pairs)}")
train_pairs.to_csv("data/train_pairs.csv", index=False)
test_pairs.to_csv("data/test_pairs.csv", index=False)

In [16]:
import pandas as pd

# Load your data
test_pairs = pd.read_csv("data/train_pairs.csv")      # file with Query, text
queries = pd.read_csv("data/queries.csv")            # file with QueryID, Query

queries = queries.drop_duplicates("Query")
test_queries = queries.merge(test_pairs, on="Query", how="inner")[["QueryID", "Query"]]

# Save to a new file
test_queries.to_csv("data/train_queries.csv", index=False)

print(f"Saved {len(test_queries)} train queries.")


Saved 500 train queries.


In [1]:
import pandas as pd 

train_pairs = pd.read_csv("data/train_pairs.csv")

In [2]:
train_pairs = list(zip(train_pairs["Query"], train_pairs["text"]))

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

MODEL_NAME = "google/muril-base-cased"

# 1. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
print("Model and tokenizer loaded.")

# 2. Dataset
class ContrastiveDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs  # [(text1, text2), ...]
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        return self.pairs[idx]

train_data = ContrastiveDataset(train_pairs)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True, random_seed=42)
print("DataLoader created.")

# 3. Mean pooling helper
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * input_mask_expanded).sum(1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# 4. Contrastive loss
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cos_sim = nn.CosineSimilarity(dim=-1)

    def forward(self, emb1, emb2):
        # Normalize
        emb1 = nn.functional.normalize(emb1, p=2, dim=1)
        emb2 = nn.functional.normalize(emb2, p=2, dim=1)

        logits = torch.matmul(emb1, emb2.T) / self.temperature
        labels = torch.arange(len(emb1)).to(emb1.device)
        loss_i = nn.functional.cross_entropy(logits, labels)
        loss_j = nn.functional.cross_entropy(logits.T, labels)
        return (loss_i + loss_j) / 2

loss_fn = ContrastiveLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
print("Loss function and optimizer set.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 5. Training loop
print("Starting training...")
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        t1 = [x[0] for x in batch]
        t2 = [x[1] for x in batch]

        inputs1 = tokenizer(list(t1), padding=True, truncation=True, return_tensors='pt').to(device)
        inputs2 = tokenizer(list(t2), padding=True, truncation=True, return_tensors='pt').to(device)

        out1 = model(**inputs1)
        out2 = model(**inputs2)

        emb1 = mean_pooling(out1, inputs1['attention_mask'])
        emb2 = mean_pooling(out2, inputs2['attention_mask'])

        loss = loss_fn(emb1, emb2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}: loss = {total_loss/len(train_loader):.4f}")

# 6. Save fine-tuned model
model.save_pretrained("muril-contrastive-newari")
tokenizer.save_pretrained("muril-contrastive-newari")


Model and tokenizer loaded.
DataLoader created.
Loss function and optimizer set.
Starting training...
Epoch 1: loss = 0.6636
Epoch 2: loss = 0.4940
Epoch 3: loss = 0.1072
Epoch 4: loss = 0.0173
Epoch 5: loss = 0.0021


('muril-contrastive-newari/tokenizer_config.json',
 'muril-contrastive-newari/special_tokens_map.json',
 'muril-contrastive-newari/vocab.txt',
 'muril-contrastive-newari/added_tokens.json',
 'muril-contrastive-newari/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

MODEL_NAME = "sundeepdwd/muril-mlm-newa-finetuned"

# 1. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
print("Model and tokenizer loaded.")

# 2. Dataset
class ContrastiveDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs  # [(text1, text2), ...]
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        return self.pairs[idx]

train_data = ContrastiveDataset(train_pairs)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
print("DataLoader created.")

# 3. Mean pooling helper
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * input_mask_expanded).sum(1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# 4. Contrastive loss
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cos_sim = nn.CosineSimilarity(dim=-1)

    def forward(self, emb1, emb2):
        # Normalize
        emb1 = nn.functional.normalize(emb1, p=2, dim=1)
        emb2 = nn.functional.normalize(emb2, p=2, dim=1)

        logits = torch.matmul(emb1, emb2.T) / self.temperature
        labels = torch.arange(len(emb1)).to(emb1.device)
        loss_i = nn.functional.cross_entropy(logits, labels)
        loss_j = nn.functional.cross_entropy(logits.T, labels)
        return (loss_i + loss_j) / 2

loss_fn = ContrastiveLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
print("Loss function and optimizer set.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 5. Training loop
print("Starting training...")
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in train_loader:
        t1 = [x[0] for x in batch]
        t2 = [x[1] for x in batch]

        inputs1 = tokenizer(list(t1), padding=True, truncation=True, return_tensors='pt').to(device)
        inputs2 = tokenizer(list(t2), padding=True, truncation=True, return_tensors='pt').to(device)

        out1 = model(**inputs1)
        out2 = model(**inputs2)

        emb1 = mean_pooling(out1, inputs1['attention_mask'])
        emb2 = mean_pooling(out2, inputs2['attention_mask'])

        loss = loss_fn(emb1, emb2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}: loss = {total_loss/len(train_loader):.4f}")

# 6. Save fine-tuned model
model.save_pretrained("muril-pretrained-contrastive-newari")
tokenizer.save_pretrained("muril-pretrained-contrastive-newari")


Some weights of BertModel were not initialized from the model checkpoint at sundeepdwd/muril-mlm-newa-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded.
DataLoader created.
Loss function and optimizer set.
