# TFIDF RUN

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import defaultdict
import re

In [12]:
# !pip install sentencepiece

In [13]:
# import sentencepiece as spm

# Runs of Devanagari code points. U+0964 and U+0965 are deliberately left out
# of the character class, so they split tokens instead of joining them.
_DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u0963\u0966-\u097F]+')

# Stopword list: one entry per line of the file, surrounding whitespace removed.
with open("data/stopwords.txt", "r", encoding="utf-8") as f:
    _STOPWORDS = {line.strip() for line in f}

# sp = spm.SentencePieceProcessor(model_file='models/newari_tokenizer.model')

def tokenize(text):
    """Split ``text`` into Devanagari word tokens and drop stopwords.

    Tokens are the non-overlapping matches of ``_DEVANAGARI_PATTERN``; the
    danda (U+0964) and double danda (U+0965) are not part of that pattern,
    so they never appear inside a token. Tokens found in ``_STOPWORDS`` are
    removed; the order of the remaining tokens is preserved.
    """
    kept = []
    for word in _DEVANAGARI_PATTERN.findall(text):
        # words = sp.encode_as_pieces(text)
        if word in _STOPWORDS:
            continue
        kept.append(word)
    return kept

def compute_tfidf(corpus):
    """Fit a TF-IDF model on the ``"text"`` column of ``corpus``.

    Args:
        corpus: object indexable by ``"text"`` whose result has ``.tolist()``
            (a pandas DataFrame in this notebook).

    Returns:
        ``(tfidf_matrix, vectorizer)``: the fitted document-term matrix, whose
        rows follow the order of ``corpus["text"]``, and the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,   # custom Devanagari tokenizer defined in this notebook
        token_pattern=None,   # the regex token pattern is unused when a tokenizer is given
        use_idf=True,
        sublinear_tf=True,
    )
    documents = corpus["text"].tolist()
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer

def search(query, tfidf_matrix, vectorizer, top_k=10):
    """Rank the indexed documents against ``query`` by cosine similarity.

    Args:
        query: raw query string; vectorized with ``vectorizer.transform``.
        tfidf_matrix: document-term matrix produced by the same vectorizer.
        vectorizer: fitted vectorizer exposing ``transform``.
        top_k: maximum number of hits to return. Values <= 0 yield ``[]``.

    Returns:
        List of ``(row_index, similarity)`` pairs, highest similarity first,
        with at most ``top_k`` entries. ``row_index`` is a row position in
        ``tfidf_matrix``.
    """
    if top_k <= 0:
        # Without this guard ``[-0:]`` selects *every* row and a negative
        # top_k selects an arbitrary tail, so "no results" would turn into
        # "almost all results".
        return []

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    # argsort is ascending: take the last top_k positions, then reverse.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [(idx, similarities[idx]) for idx in top_indices]

import joblib

def save_tfidf(tfidf_matrix, vectorizer, folder_path="models"):
    """
    Persist the fitted vectorizer and the TF-IDF matrix with joblib.

    The folder is created if it does not exist. Two files are written inside
    it: ``vectorizer.joblib`` and ``tfidf_matrix.joblib``.
    """
    import os

    os.makedirs(folder_path, exist_ok=True)

    # Vectorizer first, then the matrix.
    artifacts = (
        (vectorizer, f"{folder_path}/vectorizer.joblib"),
        (tfidf_matrix, f"{folder_path}/tfidf_matrix.joblib"),
    )
    for obj, target in artifacts:
        joblib.dump(obj, target)

    print(f"TF-IDF matrix and vectorizer saved to folder: {folder_path}")

def load_tfidf(folder_path="models"):
    """
    Read back the TF-IDF matrix and vectorizer from ``folder_path``.

    Expects ``vectorizer.joblib`` and ``tfidf_matrix.joblib`` inside the
    folder and returns ``(tfidf_matrix, vectorizer)``.

    NOTE(review): joblib files are pickle-based; only load files from a
    trusted source.
    """
    import joblib

    loaded = {
        name: joblib.load(f"{folder_path}/{name}.joblib")
        for name in ("vectorizer", "tfidf_matrix")
    }

    print(f"TF-IDF matrix and vectorizer loaded from folder: {folder_path}")
    return loaded["tfidf_matrix"], loaded["vectorizer"]


In [14]:
# Run configuration for the TF-IDF baseline.
TOP_K = 20  # passed to search() as the number of hits kept per query
# NOTE(review): RUN_FILE is not referenced in the visible cells; the run is
# written to the hard-coded "data/tfidf.json" later on — confirm which path is intended.
RUN_FILE = "data/tfidf_run.json"
QUERIES_FILE = "data/queries.csv"
CORPUS_FILE = 'data/corpus_trec.csv'

# 1. Load Data and Compute TF-IDF
corpus = pd.read_csv(CORPUS_FILE)

# The DocID column must be correctly named in your corpus.csv
# doc_ids[i] is the DocID of corpus row i.
doc_ids = corpus['DocID'].tolist() 

# Fit the vectorizer on corpus['text']; matrix rows follow the corpus row order.
tfidf_matrix, vectorizer = compute_tfidf(corpus)

In [15]:
# save_tfidf(tfidf_matrix, vectorizer, folder_path="models")

In [16]:
# 2. Load Queries
queries = pd.read_csv(QUERIES_FILE)

# Run every query against the TF-IDF index.
# results maps QueryID -> list of (corpus row index, cosine score), best first.
results = defaultdict()
for qid, query in queries.iterrows():
    query_id, query_text = query['QueryID'], query['Query']
    results[query_id] = search(query_text, tfidf_matrix, vectorizer, TOP_K)

In [17]:
# converting indexes to docids
doc_ids = corpus['DocID'].tolist()

# Each hit is (row index, score); only the row index is used to look up the DocID.
# NOTE: this overwrites the entries of `results` in place, so the cell is not
# safe to run twice without first re-running the search cell.
for query in list(results):
    hits = results[query]
    results[query] = [doc_ids[hit[0]] for hit in hits]

In [18]:
# saving the run in a json file
import json

outputfile = "data/tfidf.json"

# Write the run as pretty-printed UTF-8 JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \u escapes.
with open(outputfile, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

In [19]:
import pandas as pd
from collections import defaultdict

ground_truth = pd.read_csv('data/groundtruth.csv')
file_path = "data/tfidf.json"

# The run file is written as UTF-8 with ensure_ascii=False, so read it back
# with the same encoding instead of the platform default (which is not UTF-8
# on every system).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    # json.load() parses the open file into a dict: QueryID -> ranked DocIDs
    predictions = json.load(file_handle)

# Build ground truth dictionary: QueryID -> value of the 'relevant docs' column
gt_dict = {}
for _, row in ground_truth.iterrows():
    gt_dict[row['QueryID']] = row['relevant docs']

# Calculate metrics
def calculate_metrics(predictions, ground_truth, k_values=[1, 5, 10, 20], ap_cutoff=5):
    """Score ranked retrieval results against one relevant document per query.

    Args:
        predictions: mapping ``query_id -> ranked list of document IDs``
            (best first).
        ground_truth: mapping ``query_id -> relevant document ID``. Queries
            for which ``ground_truth.get(query_id)`` is ``None`` are skipped.
        k_values: cutoffs for Top-k accuracy. The list is only read, never
            mutated.
        ap_cutoff: how many top-ranked documents are considered for the
            "MAP" score. ``None`` means the whole ranked list. The default
            of 5 keeps the previously hard-coded behaviour.

    Returns:
        Dict with one ``'Top-{k} Accuracy'`` entry per ``k`` (fraction of
        evaluated queries whose relevant document is in the first ``k``
        results) and ``'MAP'``: the mean over evaluated queries of
        ``1 / rank`` of the relevant document, or 0 when it is not within
        ``ap_cutoff``. With a single relevant document per query this is a
        mean reciprocal rank with a cutoff.

    Raises:
        ValueError: if no query in ``predictions`` has a ground-truth entry
            (previously this surfaced as a ``ZeroDivisionError``).
    """
    hits = {k: [] for k in k_values}
    ap_scores = []

    for query_id, ranked_docs in predictions.items():
        relevant_doc = ground_truth.get(query_id)
        if relevant_doc is None:
            continue

        # Top-k accuracy
        for k in k_values:
            hits[k].append(1 if relevant_doc in ranked_docs[:k] else 0)

        # Reciprocal rank of the relevant document within the cutoff
        scored = ranked_docs if ap_cutoff is None else ranked_docs[:ap_cutoff]
        if relevant_doc in scored:
            ap_scores.append(1.0 / (scored.index(relevant_doc) + 1))  # 1-indexed rank
        else:
            ap_scores.append(0.0)

    if not ap_scores:
        raise ValueError(
            "no query in predictions has a ground-truth entry; nothing to evaluate"
        )

    # Calculate averages
    metrics = {}
    for k in k_values:
        metrics[f'Top-{k} Accuracy'] = sum(hits[k]) / len(hits[k])
    metrics['MAP'] = sum(ap_scores) / len(ap_scores)

    return metrics

# Score the loaded run against the ground truth, then print every metric
# both as a fraction and as a percentage.
metrics = calculate_metrics(predictions, gt_dict)

report_lines = ["Evaluation Metrics:", "-" * 40]
for metric_name, score in metrics.items():
    report_lines.append(f"{metric_name}: {score:.4f} ({score*100:.2f}%)")
print("\n".join(report_lines))

Evaluation Metrics:
----------------------------------------
Top-1 Accuracy: 0.7430 (74.30%)
Top-5 Accuracy: 0.8890 (88.90%)
Top-10 Accuracy: 0.9110 (91.10%)
Top-20 Accuracy: 0.9270 (92.70%)
MAP: 0.8036 (80.36%)


# Muril Run

In [20]:
import pandas as pd
import torch
import torch.nn.functional as F
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel

In [21]:
# Run configuration for the MLM-fine-tuned MuRIL model.
TOP_K = 20  # upper bound on the number of documents kept per query
QUERIES_FILE = "data/queries.csv"
# NOTE(review): RUN_FILE is not referenced by name in the visible cells; the
# save cell hard-codes the same path in `outputfile`.
RUN_FILE = "data/muril.json"
CORPUS_FILE = 'data/corpus_trec.csv'
MODEL_NAME = "sundeepdwd/muril-mlm-newa-finetuned"  # identifier passed to from_pretrained()

In [22]:
def get_mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output.last_hidden_state`` is multiplied by the attention mask
    (cast to float), summed over dimension 1 and divided by the number of
    unmasked positions. The divisor is clamped to at least 1e-9, so a fully
    masked sequence yields zeros rather than a division by zero.
    Presumably the embeddings are (batch, seq_len, hidden) and the mask is
    (batch, seq_len) — the result then has shape (batch, hidden).
    """
    hidden_states = model_output.last_hidden_state
    # Trailing singleton axis lets the mask broadcast across the hidden dimension.
    mask = attention_mask.unsqueeze(-1).float()
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

# Use the GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# 1. Load Model and Data
# Tokenizer and encoder are both loaded from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the encoder to the selected device.
model.to(device)

# doc_ids and doc_texts come from the same rows, so position i in one list
# corresponds to position i in the other.
corpus = pd.read_csv(CORPUS_FILE)
doc_ids = corpus['DocID'].tolist() 
doc_texts = corpus['text'].tolist()

# NOTE: This step is memory intensive. For 80,000+ docs, pre-calculate embeddings once.
print("Computing document embeddings (can take time for large corpus)...")
doc_embeddings = []

# One forward pass per document (no batching), with gradients disabled for
# the whole loop. Each pooled vector is collected and stacked at the end.
with torch.no_grad():
    for text in doc_texts:
        tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        output = model(**tokens)
        doc_embeddings.append(get_mean_pooling(output, tokens['attention_mask']))

# Concatenate along the first axis: one row per document, in corpus order.
all_doc_embeddings = torch.cat(doc_embeddings)

# 2. Load Queries
queries = pd.read_csv(QUERIES_FILE)
results = defaultdict()
for idx, query in queries.iterrows():
    qid, title = query['QueryID'], query['Query']  # the 'Query' column is the query text

    # Embed the query with the same tokenizer, model and pooling as the documents.
    query_tokens = tokenizer(title, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        query_output = model(**query_tokens)
        query_embedding = get_mean_pooling(query_output, query_tokens['attention_mask'])

    # Cosine similarity of the query vector against every document vector
    # (one score per row of all_doc_embeddings).
    scores = F.cosine_similarity(query_embedding, all_doc_embeddings)

    # Keep the TOP_K best-scoring rows, or all rows if there are fewer.
    top_scores, top_indices = torch.topk(scores, k=min(TOP_K, len(scores)))

    # Map row positions back to DocIDs and save the ranking in the results dict.
    top_doc_ids = [doc_ids[position] for position in top_indices.tolist()]
    results[qid] = top_doc_ids
    print(f"Processed Query {qid} with {len(top_doc_ids)} results.")
    

Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at sundeepdwd/muril-mlm-newa-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Computing document embeddings (can take time for large corpus)...
Processed Query Q0001 with 20 results.
Processed Query Q0002 with 20 results.
Processed Query Q0003 with 20 results.
Processed Query Q0004 with 20 results.
Processed Query Q0005 with 20 results.
Processed Query Q0006 with 20 results.
Processed Query Q0007 with 20 results.
Processed Query Q0008 with 20 results.
Processed Query Q0009 with 20 results.
Processed Query Q0010 with 20 results.
Processed Query Q0011 with 20 results.
Processed Query Q0012 with 20 results.
Processed Query Q0013 with 20 results.
Processed Query Q0014 with 20 results.
Processed Query Q0015 with 20 results.
Processed Query Q0016 with 20 results.
Processed Query Q0017 with 20 results.
Processed Query Q0018 with 20 results.
Processed Query Q0019 with 20 results.
Processed Query Q0020 with 20 results.
Processed Query Q0021 with 20 results.
Processed Query Q0022 with 20 results.
Processed Query Q0023 with 20 results.
Processed Query Q0024 with 20 results

In [23]:
# saving the run in a json file
import json

outputfile = "data/muril.json"

# Write the run as pretty-printed UTF-8 JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \u escapes.
with open(outputfile, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

# Calculating the Top@1 accuracy, Top@5 accuracy and Mean Average Precision

In [24]:
import pandas as pd
from collections import defaultdict

ground_truth = pd.read_csv('data/groundtruth.csv')
file_path = "data/muril.json"

# The run file is written as UTF-8 with ensure_ascii=False, so read it back
# with the same encoding instead of the platform default (which is not UTF-8
# on every system).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    # json.load() parses the open file into a dict: QueryID -> ranked DocIDs
    predictions = json.load(file_handle)

# Build ground truth dictionary: QueryID -> value of the 'relevant docs' column
gt_dict = {}
for _, row in ground_truth.iterrows():
    gt_dict[row['QueryID']] = row['relevant docs']

# Calculate metrics
def calculate_metrics(predictions, ground_truth, k_values=[1, 5,10], ap_cutoff=5):
    """Score ranked retrieval results against one relevant document per query.

    Args:
        predictions: mapping ``query_id -> ranked list of document IDs``
            (best first).
        ground_truth: mapping ``query_id -> relevant document ID``. Queries
            for which ``ground_truth.get(query_id)`` is ``None`` are skipped.
        k_values: cutoffs for Top-k accuracy. The list is only read, never
            mutated.
        ap_cutoff: how many top-ranked documents are considered for the
            "MAP" score. ``None`` means the whole ranked list. The default
            of 5 keeps the previously hard-coded behaviour.

    Returns:
        Dict with one ``'Top-{k} Accuracy'`` entry per ``k`` (fraction of
        evaluated queries whose relevant document is in the first ``k``
        results) and ``'MAP'``: the mean over evaluated queries of
        ``1 / rank`` of the relevant document, or 0 when it is not within
        ``ap_cutoff``. With a single relevant document per query this is a
        mean reciprocal rank with a cutoff.

    Raises:
        ValueError: if no query in ``predictions`` has a ground-truth entry
            (previously this surfaced as a ``ZeroDivisionError``).
    """
    hits = {k: [] for k in k_values}
    ap_scores = []

    for query_id, ranked_docs in predictions.items():
        relevant_doc = ground_truth.get(query_id)
        if relevant_doc is None:
            continue

        # Top-k accuracy
        for k in k_values:
            hits[k].append(1 if relevant_doc in ranked_docs[:k] else 0)

        # Reciprocal rank of the relevant document within the cutoff
        scored = ranked_docs if ap_cutoff is None else ranked_docs[:ap_cutoff]
        if relevant_doc in scored:
            ap_scores.append(1.0 / (scored.index(relevant_doc) + 1))  # 1-indexed rank
        else:
            ap_scores.append(0.0)

    if not ap_scores:
        raise ValueError(
            "no query in predictions has a ground-truth entry; nothing to evaluate"
        )

    # Calculate averages
    metrics = {}
    for k in k_values:
        metrics[f'Top-{k} Accuracy'] = sum(hits[k]) / len(hits[k])
    metrics['MAP'] = sum(ap_scores) / len(ap_scores)

    return metrics

# Score the loaded run against the ground truth, then print every metric
# both as a fraction and as a percentage.
metrics = calculate_metrics(predictions, gt_dict)

report_lines = ["Evaluation Metrics:", "-" * 40]
for metric_name, score in metrics.items():
    report_lines.append(f"{metric_name}: {score:.4f} ({score*100:.2f}%)")
print("\n".join(report_lines))

Evaluation Metrics:
----------------------------------------
Top-1 Accuracy: 0.0120 (1.20%)
Top-5 Accuracy: 0.0260 (2.60%)
Top-10 Accuracy: 0.0350 (3.50%)
MAP: 0.0176 (1.76%)


# Tasks to do 
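- TF-IDF (zero-shot) works well: Top-1 74.30%, Top-5 88.90%, Top-10 91.10%, Top-20 92.70%, MAP 80.36%.
- The baseline that was only fine-tuned with MLM (sundeepdwd/muril-mlm-newa-finetuned) reaches just 1.20%, 2.60% and 3.50% accuracy at @1, @5 and @10.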

- The average token embedding (mean pooling) method doesn't work well for retrieval.

- Fine-tune sundeepdwd/muril with contrastive learning
- Finetune " on english/hindi only on "

- Zero-shot eval on EmbeddingGemma

## to read 
- max content length - parameter impact and solution

# Google Muril with contrastive fine tuning

In [25]:
import pandas as pd
import torch
import torch.nn.functional as F
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel

# Run configuration for the contrastively fine-tuned MuRIL model.
TOP_K = 20  # upper bound on the number of documents kept per query
QUERIES_FILE = "data/test_queries.csv"
# NOTE(review): RUN_FILE is not referenced in the visible code, and it differs
# from the hard-coded outputfile "data/muril_google_fine_tune.json" used when
# saving the run — confirm which path is intended.
RUN_FILE = "data/google_muril_fine.json"
CORPUS_FILE = 'data/corpus_trec.csv'
# NOTE(review): no organisation prefix — presumably a local checkpoint directory; confirm.
MODEL_NAME = "muril-contrastive-newari"

def get_mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output.last_hidden_state`` is multiplied by the attention mask
    (cast to float), summed over dimension 1 and divided by the number of
    unmasked positions. The divisor is clamped to at least 1e-9, so a fully
    masked sequence yields zeros rather than a division by zero.
    Presumably the embeddings are (batch, seq_len, hidden) and the mask is
    (batch, seq_len) — the result then has shape (batch, hidden).
    """
    hidden_states = model_output.last_hidden_state
    # Trailing singleton axis lets the mask broadcast across the hidden dimension.
    mask = attention_mask.unsqueeze(-1).float()
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

# Use the GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# 1. Load Model and Data
# Tokenizer and encoder are both loaded from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the encoder to the selected device.
model.to(device)

# doc_ids and doc_texts come from the same rows, so position i in one list
# corresponds to position i in the other.
corpus = pd.read_csv(CORPUS_FILE)
doc_ids = corpus['DocID'].tolist() 
doc_texts = corpus['text'].tolist()

# NOTE: This step is memory intensive. For 80,000+ docs, pre-calculate embeddings once.
print("Computing document embeddings (can take time for large corpus)...")
doc_embeddings = []

# One forward pass per document (no batching), with gradients disabled for
# the whole loop. Each pooled vector is collected and stacked at the end.
with torch.no_grad():
    for text in doc_texts:
        tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        output = model(**tokens)
        doc_embeddings.append(get_mean_pooling(output, tokens['attention_mask']))

# Concatenate along the first axis: one row per document, in corpus order.
all_doc_embeddings = torch.cat(doc_embeddings)

# 2. Load Queries
queries = pd.read_csv(QUERIES_FILE)
results = defaultdict()
for idx, query in queries.iterrows():
    qid, title = query['QueryID'], query['Query']  # the 'Query' column is the query text

    # Embed the query with the same tokenizer, model and pooling as the documents.
    query_tokens = tokenizer(title, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        query_output = model(**query_tokens)
        query_embedding = get_mean_pooling(query_output, query_tokens['attention_mask'])

    # Cosine similarity of the query vector against every document vector
    # (one score per row of all_doc_embeddings).
    scores = F.cosine_similarity(query_embedding, all_doc_embeddings)

    # Keep the TOP_K best-scoring rows, or all rows if there are fewer.
    top_scores, top_indices = torch.topk(scores, k=min(TOP_K, len(scores)))

    # Map row positions back to DocIDs and save the ranking in the results dict.
    top_doc_ids = [doc_ids[position] for position in top_indices.tolist()]
    results[qid] = top_doc_ids
    print(f"Processed Query {qid} with {len(top_doc_ids)} results.")
    
# saving the run in a json file
import json

outputfile = "data/muril_google_fine_tune.json"

# Write the run as pretty-printed UTF-8 JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \u escapes.
with open(outputfile, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

import pandas as pd
from collections import defaultdict

ground_truth = pd.read_csv('data/groundtruth.csv')
file_path = "data/muril_google_fine_tune.json"

# The run file is written as UTF-8 with ensure_ascii=False, so read it back
# with the same encoding instead of the platform default (which is not UTF-8
# on every system).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    # json.load() parses the open file into a dict: QueryID -> ranked DocIDs
    predictions = json.load(file_handle)

# Build ground truth dictionary: QueryID -> value of the 'relevant docs' column
gt_dict = {}
for _, row in ground_truth.iterrows():
    gt_dict[row['QueryID']] = row['relevant docs']

# Calculate metrics
def calculate_metrics(predictions, ground_truth, k_values=[1, 5, 10, 20], ap_cutoff=None):
    """Score ranked retrieval results against one relevant document per query.

    Args:
        predictions: mapping ``query_id -> ranked list of document IDs``
            (best first).
        ground_truth: mapping ``query_id -> relevant document ID``. Queries
            for which ``ground_truth.get(query_id)`` is ``None`` are skipped.
        k_values: cutoffs for Top-k accuracy. The list is only read, never
            mutated.
        ap_cutoff: how many top-ranked documents are considered for the
            "MAP" score. ``None`` (the default, matching this block's
            previous behaviour) means the whole ranked list.

    Returns:
        Dict with one ``'Top-{k} Accuracy'`` entry per ``k`` (fraction of
        evaluated queries whose relevant document is in the first ``k``
        results) and ``'MAP'``: the mean over evaluated queries of
        ``1 / rank`` of the relevant document, or 0 when it is not within
        ``ap_cutoff``. With a single relevant document per query this is a
        mean reciprocal rank.

    Raises:
        ValueError: if no query in ``predictions`` has a ground-truth entry
            (previously this surfaced as a ``ZeroDivisionError``).
    """
    hits = {k: [] for k in k_values}
    ap_scores = []

    for query_id, ranked_docs in predictions.items():
        relevant_doc = ground_truth.get(query_id)
        if relevant_doc is None:
            continue

        # Top-k accuracy. The slice must use k: a fixed [:5] here made every
        # Top-k figure identical to Top-5.
        for k in k_values:
            hits[k].append(1 if relevant_doc in ranked_docs[:k] else 0)

        # Reciprocal rank of the relevant document within the cutoff
        scored = ranked_docs if ap_cutoff is None else ranked_docs[:ap_cutoff]
        if relevant_doc in scored:
            ap_scores.append(1.0 / (scored.index(relevant_doc) + 1))  # 1-indexed rank
        else:
            ap_scores.append(0.0)

    if not ap_scores:
        raise ValueError(
            "no query in predictions has a ground-truth entry; nothing to evaluate"
        )

    # Calculate averages
    metrics = {}
    for k in k_values:
        metrics[f'Top-{k} Accuracy'] = sum(hits[k]) / len(hits[k])
    metrics['MAP'] = sum(ap_scores) / len(ap_scores)

    return metrics

# Score the loaded run against the ground truth, then print every metric
# both as a fraction and as a percentage.
metrics = calculate_metrics(predictions, gt_dict)

report_lines = ["Evaluation Metrics:", "-" * 40]
for metric_name, score in metrics.items():
    report_lines.append(f"{metric_name}: {score:.4f} ({score*100:.2f}%)")
print("\n".join(report_lines))

Using device: cuda
Computing document embeddings (can take time for large corpus)...
Processed Query Q0001 with 20 results.
Processed Query Q0003 with 20 results.
Processed Query Q0004 with 20 results.
Processed Query Q0006 with 20 results.
Processed Query Q0007 with 20 results.
Processed Query Q0008 with 20 results.
Processed Query Q0010 with 20 results.
Processed Query Q0011 with 20 results.
Processed Query Q0012 with 20 results.
Processed Query Q0013 with 20 results.
Processed Query Q0019 with 20 results.
Processed Query Q0024 with 20 results.
Processed Query Q0025 with 20 results.
Processed Query Q0026 with 20 results.
Processed Query Q0029 with 20 results.
Processed Query Q0030 with 20 results.
Processed Query Q0031 with 20 results.
Processed Query Q0032 with 20 results.
Processed Query Q0034 with 20 results.
Processed Query Q0040 with 20 results.
Processed Query Q0042 with 20 results.
Processed Query Q0043 with 20 results.
Processed Query Q0044 with 20 results.
Processed Query Q0

# Muril Fine tuned contrastive

In [26]:
import pandas as pd
import torch
import torch.nn.functional as F
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel

# Run configuration for the pretrained + contrastively fine-tuned MuRIL model.
TOP_K = 20  # upper bound on the number of documents kept per query
QUERIES_FILE = "data/test_queries.csv"
# NOTE(review): RUN_FILE is not referenced in the visible code, and it differs
# from the hard-coded outputfile "data/muril_pretrain_fine_tune.json" used when
# saving the run — confirm which path is intended.
RUN_FILE = "data/google_muril_fine.json"
CORPUS_FILE = 'data/corpus_trec.csv'
# NOTE(review): no organisation prefix — presumably a local checkpoint directory; confirm.
MODEL_NAME = "muril-pretrained-contrastive-newari"

def get_mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output.last_hidden_state`` is multiplied by the attention mask
    (cast to float), summed over dimension 1 and divided by the number of
    unmasked positions. The divisor is clamped to at least 1e-9, so a fully
    masked sequence yields zeros rather than a division by zero.
    Presumably the embeddings are (batch, seq_len, hidden) and the mask is
    (batch, seq_len) — the result then has shape (batch, hidden).
    """
    hidden_states = model_output.last_hidden_state
    # Trailing singleton axis lets the mask broadcast across the hidden dimension.
    mask = attention_mask.unsqueeze(-1).float()
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

# Use the GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# 1. Load Model and Data
# Tokenizer and encoder are both loaded from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the encoder to the selected device.
model.to(device)

# doc_ids and doc_texts come from the same rows, so position i in one list
# corresponds to position i in the other.
corpus = pd.read_csv(CORPUS_FILE)
doc_ids = corpus['DocID'].tolist() 
doc_texts = corpus['text'].tolist()

# NOTE: This step is memory intensive. For 80,000+ docs, pre-calculate embeddings once.
print("Computing document embeddings (can take time for large corpus)...")
doc_embeddings = []

# One forward pass per document (no batching), with gradients disabled for
# the whole loop. Each pooled vector is collected and stacked at the end.
with torch.no_grad():
    for text in doc_texts:
        tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        output = model(**tokens)
        doc_embeddings.append(get_mean_pooling(output, tokens['attention_mask']))

# Concatenate along the first axis: one row per document, in corpus order.
all_doc_embeddings = torch.cat(doc_embeddings)

# 2. Load Queries
queries = pd.read_csv(QUERIES_FILE)
results = defaultdict()
for idx, query in queries.iterrows():
    qid, title = query['QueryID'], query['Query']  # the 'Query' column is the query text

    # Embed the query with the same tokenizer, model and pooling as the documents.
    query_tokens = tokenizer(title, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        query_output = model(**query_tokens)
        query_embedding = get_mean_pooling(query_output, query_tokens['attention_mask'])

    # Cosine similarity of the query vector against every document vector
    # (one score per row of all_doc_embeddings).
    scores = F.cosine_similarity(query_embedding, all_doc_embeddings)

    # Keep the TOP_K best-scoring rows, or all rows if there are fewer.
    top_scores, top_indices = torch.topk(scores, k=min(TOP_K, len(scores)))

    # Map row positions back to DocIDs and save the ranking in the results dict.
    top_doc_ids = [doc_ids[position] for position in top_indices.tolist()]
    results[qid] = top_doc_ids
    print(f"Processed Query {qid} with {len(top_doc_ids)} results.")
    
# saving the run in a json file
import json

outputfile = "data/muril_pretrain_fine_tune.json"

# Write the run as pretty-printed UTF-8 JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \u escapes.
with open(outputfile, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

import pandas as pd
from collections import defaultdict

ground_truth = pd.read_csv('data/groundtruth.csv')
file_path = "data/muril_pretrain_fine_tune.json"

# The run file is written as UTF-8 with ensure_ascii=False, so read it back
# with the same encoding instead of the platform default (which is not UTF-8
# on every system).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    # json.load() parses the open file into a dict: QueryID -> ranked DocIDs
    predictions = json.load(file_handle)

# Build ground truth dictionary: QueryID -> value of the 'relevant docs' column
gt_dict = {}
for _, row in ground_truth.iterrows():
    gt_dict[row['QueryID']] = row['relevant docs']

# Calculate metrics
def calculate_metrics(predictions, ground_truth, k_values=[1, 5, 10, 20], ap_cutoff=None):
    """Score ranked retrieval results against one relevant document per query.

    Args:
        predictions: mapping ``query_id -> ranked list of document IDs``
            (best first).
        ground_truth: mapping ``query_id -> relevant document ID``. Queries
            for which ``ground_truth.get(query_id)`` is ``None`` are skipped.
        k_values: cutoffs for Top-k accuracy. The list is only read, never
            mutated.
        ap_cutoff: how many top-ranked documents are considered for the
            "MAP" score. ``None`` (the default, matching this block's
            previous behaviour) means the whole ranked list.

    Returns:
        Dict with one ``'Top-{k} Accuracy'`` entry per ``k`` (fraction of
        evaluated queries whose relevant document is in the first ``k``
        results) and ``'MAP'``: the mean over evaluated queries of
        ``1 / rank`` of the relevant document, or 0 when it is not within
        ``ap_cutoff``. With a single relevant document per query this is a
        mean reciprocal rank.

    Raises:
        ValueError: if no query in ``predictions`` has a ground-truth entry
            (previously this surfaced as a ``ZeroDivisionError``).
    """
    hits = {k: [] for k in k_values}
    ap_scores = []

    for query_id, ranked_docs in predictions.items():
        relevant_doc = ground_truth.get(query_id)
        if relevant_doc is None:
            continue

        # Top-k accuracy. The slice must use k: a fixed [:5] here made every
        # Top-k figure identical to Top-5.
        for k in k_values:
            hits[k].append(1 if relevant_doc in ranked_docs[:k] else 0)

        # Reciprocal rank of the relevant document within the cutoff
        scored = ranked_docs if ap_cutoff is None else ranked_docs[:ap_cutoff]
        if relevant_doc in scored:
            ap_scores.append(1.0 / (scored.index(relevant_doc) + 1))  # 1-indexed rank
        else:
            ap_scores.append(0.0)

    if not ap_scores:
        raise ValueError(
            "no query in predictions has a ground-truth entry; nothing to evaluate"
        )

    # Calculate averages
    metrics = {}
    for k in k_values:
        metrics[f'Top-{k} Accuracy'] = sum(hits[k]) / len(hits[k])
    metrics['MAP'] = sum(ap_scores) / len(ap_scores)

    return metrics

# Score the loaded run against the ground truth, then print every metric
# both as a fraction and as a percentage.
metrics = calculate_metrics(predictions, gt_dict)

report_lines = ["Evaluation Metrics:", "-" * 40]
for metric_name, score in metrics.items():
    report_lines.append(f"{metric_name}: {score:.4f} ({score*100:.2f}%)")
print("\n".join(report_lines))

Using device: cuda
Computing document embeddings (can take time for large corpus)...
Processed Query Q0001 with 20 results.
Processed Query Q0003 with 20 results.
Processed Query Q0004 with 20 results.
Processed Query Q0006 with 20 results.
Processed Query Q0007 with 20 results.
Processed Query Q0008 with 20 results.
Processed Query Q0010 with 20 results.
Processed Query Q0011 with 20 results.
Processed Query Q0012 with 20 results.
Processed Query Q0013 with 20 results.
Processed Query Q0019 with 20 results.
Processed Query Q0024 with 20 results.
Processed Query Q0025 with 20 results.
Processed Query Q0026 with 20 results.
Processed Query Q0029 with 20 results.
Processed Query Q0030 with 20 results.
Processed Query Q0031 with 20 results.
Processed Query Q0032 with 20 results.
Processed Query Q0034 with 20 results.
Processed Query Q0040 with 20 results.
Processed Query Q0042 with 20 results.
Processed Query Q0043 with 20 results.
Processed Query Q0044 with 20 results.
Processed Query Q0