# TF-IDF Run

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import defaultdict
import re

In [2]:
# Matches runs of code points in the Devanagari block, with U+0964 (।, danda)
# and U+0965 (॥, double danda) left out of the character class so that this
# punctuation never becomes part of a token.
_DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u0963\u0966-\u097F]+')

# Stopword list: every line of the file, stripped of surrounding whitespace.
with open("data/stopwords.txt", encoding="utf-8") as f:
    _STOPWORDS = {line.strip() for line in f}

def tokenize(text):
    """Split ``text`` into normalized Devanagari tokens with stopwords removed.

    Tokens are the runs matched by ``_DEVANAGARI_PATTERN``, which leaves out the
    danda (।, U+0964) and double danda (॥, U+0965) punctuation marks. In every
    token the long vowel signs are replaced by their short counterparts
    (U+0940 ी -> U+093F ि and U+0942 ू -> U+0941 ु) before stopword filtering.

    Args:
        text: The string to tokenize.

    Returns:
        A list of normalized tokens, in order of appearance, that are not in
        ``_STOPWORDS``.
    """
    tokens = []
    for raw_word in _DEVANAGARI_PATTERN.findall(text):
        word = raw_word.replace("\u0940", "\u093F").replace("\u0942", "\u0941")
        # NOTE(review): membership is tested on the *normalized* form, so this
        # assumes the entries of stopwords.txt use short vowel signs too -
        # TODO confirm against the file.
        if word not in _STOPWORDS:
            tokens.append(word)
    return tokens

def compute_tfidf(corpus):
    """Fit a TF-IDF vectorizer on the ``"text"`` column of ``corpus``.

    Args:
        corpus: Object supporting ``corpus["text"].tolist()`` (a pandas
            DataFrame in this notebook) holding one document per row.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the result of
        ``fit_transform`` on the documents and the fitted ``TfidfVectorizer``.
    """
    documents = corpus["text"].tolist()

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,   # custom Devanagari tokenizer defined in this notebook
        token_pattern=None,   # the built-in pattern is not used once a tokenizer is supplied
        use_idf=True,
        sublinear_tf=True,
    )
    tfidf_matrix = vectorizer.fit_transform(documents)

    return tfidf_matrix, vectorizer

def search(query, tfidf_matrix, vectorizer, top_k=10):
    """Rank the indexed documents against ``query`` by cosine similarity.

    Args:
        query: Query string; it is vectorized with ``vectorizer.transform``.
        tfidf_matrix: Document matrix produced by the same ``vectorizer``.
        vectorizer: Fitted vectorizer used to embed the query.
        top_k: Maximum number of hits to return. Values of zero or less yield
            an empty list.

    Returns:
        A list of ``(row_index, similarity)`` tuples, best match first, with at
        most ``top_k`` entries. ``row_index`` is the row position in
        ``tfidf_matrix``.
    """
    # Guard: with top_k == 0 the slice below becomes `[-0:]`, which selects
    # *every* document, and a negative top_k would slice from the wrong end.
    if top_k <= 0:
        return []

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    # argsort is ascending, so take the last top_k positions and reverse them.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [(idx, similarities[idx]) for idx in top_indices]

In [3]:
TOP_K = 10
# Path of the saved TF-IDF run. This matches the file that the save cell writes
# and the evaluation cell reads ("data/tfidf.json"); the previous value
# "data/tfidf_run.json" was not written or read anywhere in the notebook.
RUN_FILE = "data/tfidf.json"
QUERIES_FILE = "data/queries.csv"
CORPUS_FILE = 'data/corpus_trec.csv'

# 1. Load Data and Compute TF-IDF
corpus = pd.read_csv(CORPUS_FILE)

# Fail early with a clear message if the corpus file lacks the columns used below
# ('DocID' here, 'text' inside compute_tfidf).
_missing_columns = {"DocID", "text"} - set(corpus.columns)
if _missing_columns:
    raise KeyError(f"{CORPUS_FILE} is missing required column(s): {sorted(_missing_columns)}")

# DocID of every corpus row, in row order, used to map row indexes back to document ids.
doc_ids = corpus['DocID'].tolist()

tfidf_matrix, vectorizer = compute_tfidf(corpus)

In [4]:
# 2. Load queries and retrieve up to TOP_K documents for each of them
queries = pd.read_csv(QUERIES_FILE)

# A plain dict is used on purpose: `defaultdict()` without a default factory
# behaves exactly like a dict and only makes the printed repr harder to read.
results = {}
# Iterate over the two columns directly; the DataFrame index is not the QueryID.
for query_id, query_text in zip(queries['QueryID'], queries['Query']):
    results[query_id] = search(query_text, tfidf_matrix, vectorizer, TOP_K)

In [5]:
# converting indexes to docids
doc_ids = corpus['DocID'].tolist()

for query_id, hits in results.items():
    # `search` returns (row_index, score) tuples. Entries that are no longer
    # tuples were already converted by an earlier run of this cell and are kept
    # as they are, so re-executing the cell no longer fails.
    results[query_id] = [
        doc_ids[hit[0]] if isinstance(hit, tuple) else hit
        for hit in hits
    ]

print(results)

defaultdict(None, {'Q0001': ['NWA02278', 'NWA73714', 'NWA02057', 'NWA07971', 'NWA04154', 'NWA02161', 'NWA06069', 'NWA06955', 'NWA06671', 'NWA35375'], 'Q0002': ['NWA42460', 'NWA41512', 'NWA13395', 'NWA41235', 'NWA43004', 'NWA12906', 'NWA12293', 'NWA40732', 'NWA13220', 'NWA42900'], 'Q0003': ['NWA78084', 'NWA70055', 'NWA78312', 'NWA74001', 'NWA07085', 'NWA12419', 'NWA07355', 'NWA78410', 'NWA07325', 'NWA73779'], 'Q0004': ['NWA13817', 'NWA74044', 'NWA10048', 'NWA77203', 'NWA77194', 'NWA29710', 'NWA74082', 'NWA01957', 'NWA09784', 'NWA07474'], 'Q0005': ['NWA31379', 'NWA34729', 'NWA27768', 'NWA27785', 'NWA27730', 'NWA27757', 'NWA27763', 'NWA31403', 'NWA27815', 'NWA27729'], 'Q0006': ['NWA13355', 'NWA29710', 'NWA11583', 'NWA42986', 'NWA39866', 'NWA39933', 'NWA41091', 'NWA74788', 'NWA11425', 'NWA12165'], 'Q0007': ['NWA53394', 'NWA78312', 'NWA74001', 'NWA78410', 'NWA73779', 'NWA73777', 'NWA73776', 'NWA73836', 'NWA73778', 'NWA07355'], 'Q0008': ['NWA28562', 'NWA74696', 'NWA02358', 'NWA75540', 'NWA75

In [6]:
# Write the TF-IDF run (the `results` mapping) to disk as JSON
import json

outputfile = "data/tfidf.json"
# indent=4 pretty-prints the file; ensure_ascii=False writes non-ASCII text as-is
dump_options = {"indent": 4, "ensure_ascii": False}

with open(outputfile, mode='w', encoding='utf-8') as f:
    json.dump(results, f, **dump_options)

# MuRIL Run

In [8]:
import pandas as pd
import torch
import torch.nn.functional as F
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel

In [10]:
# Configuration for the dense-retrieval run
MODEL_NAME = "sundeepdwd/muril-mlm-newa-finetuned"  # identifier handed to from_pretrained
CORPUS_FILE = "data/corpus_trec.csv"                # documents to embed
QUERIES_FILE = "data/queries.csv"                   # queries to run
RUN_FILE = "data/muril.json"                        # path for the saved run
TOP_K = 10                                          # number of documents kept per query

In [11]:
def get_mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state``
            (presumably shaped (batch, seq_len, hidden) - TODO confirm).
        attention_mask: Mask tensor; it is given a trailing axis and expanded
            to the size of ``last_hidden_state``, so positions with value 0
            contribute nothing to the average.

    Returns:
        The masked sum over axis 1 divided by the per-sequence mask total.
    """
    hidden_states = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    # The lower bound keeps a fully masked sequence from dividing by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / mask_total

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Load model and data
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

model.to(device)
model.eval()  # inference only: make evaluation mode explicit before embedding anything


def _embed_text(text):
    """Return the mean-pooled model embedding of one text, computed on ``device``.

    Shared by documents and queries so both are tokenized and pooled identically.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    tokens.to(device)
    with torch.no_grad():
        output = model(**tokens)
    return get_mean_pooling(output, tokens['attention_mask'])


corpus = pd.read_csv(CORPUS_FILE)
doc_ids = corpus['DocID'].tolist()
doc_texts = corpus['text'].tolist()

# 2. Embed every document
# NOTE: documents are embedded one at a time, which is slow and memory hungry for
# a large corpus. For 80,000+ docs, pre-calculate the embeddings once and/or
# switch to batched encoding.
print("Computing document embeddings (can take time for large corpus)...")
doc_embeddings = [_embed_text(text) for text in doc_texts]

# Stack the per-document embeddings into one tensor (concatenated along dim 0).
all_doc_embeddings = torch.cat(doc_embeddings)

# 3. Load queries
queries = pd.read_csv(QUERIES_FILE)

# Plain dict: `defaultdict()` without a default factory behaves like a dict anyway.
results = {}
for qid, query_text in zip(queries['QueryID'], queries['Query']):
    # 4. Compute the query embedding
    query_embedding = _embed_text(query_text)

    # 5. Cosine similarity between the query and every document embedding
    scores = F.cosine_similarity(query_embedding, all_doc_embeddings)

    # 6. Keep the TOP_K best-scoring documents (fewer if the corpus is smaller)
    top_scores, top_indices = torch.topk(scores, k=min(TOP_K, len(scores)))

    # Map the row indexes back to DocIDs
    top_doc_ids = [doc_ids[i] for i in top_indices.tolist()]

    # Save to dict
    results[qid] = top_doc_ids
    print(f"Processed Query {qid} with {len(top_doc_ids)} results.")
    

Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at sundeepdwd/muril-mlm-newa-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Computing document embeddings (can take time for large corpus)...
Processed Query Q0001 with 10 results.
Processed Query Q0002 with 10 results.
Processed Query Q0003 with 10 results.
Processed Query Q0004 with 10 results.
Processed Query Q0005 with 10 results.
Processed Query Q0006 with 10 results.
Processed Query Q0007 with 10 results.
Processed Query Q0008 with 10 results.
Processed Query Q0009 with 10 results.
Processed Query Q0010 with 10 results.
Processed Query Q0011 with 10 results.
Processed Query Q0012 with 10 results.
Processed Query Q0013 with 10 results.
Processed Query Q0014 with 10 results.
Processed Query Q0015 with 10 results.
Processed Query Q0016 with 10 results.
Processed Query Q0017 with 10 results.
Processed Query Q0018 with 10 results.
Processed Query Q0019 with 10 results.
Processed Query Q0020 with 10 results.
Processed Query Q0021 with 10 results.
Processed Query Q0022 with 10 results.
Processed Query Q0023 with 10 results.
Processed Query Q0024 with 10 results

In [12]:
# Show the full QueryID -> ranked DocIDs mapping built by the retrieval loop above.
print(results)

defaultdict(None, {'Q0001': ['NWA32554', 'NWA15911', 'NWA31765', 'NWA74717', 'NWA55684', 'NWA53358', 'NWA32132', 'NWA11263', 'NWA44914', 'NWA44937'], 'Q0002': ['NWA28648', 'NWA11425', 'NWA39137', 'NWA74695', 'NWA77695', 'NWA74515', 'NWA73716', 'NWA08916', 'NWA74704', 'NWA77275'], 'Q0003': ['NWA27852', 'NWA27863', 'NWA27835', 'NWA27823', 'NWA27831', 'NWA27828', 'NWA27830', 'NWA27829', 'NWA27824', 'NWA27827'], 'Q0004': ['NWA74708', 'NWA78594', 'NWA77305', 'NWA08650', 'NWA74760', 'NWA09761', 'NWA09222', 'NWA77695', 'NWA73614', 'NWA73707'], 'Q0005': ['NWA15368', 'NWA15503', 'NWA65193', 'NWA57504', 'NWA77333', 'NWA78428', 'NWA12430', 'NWA31907', 'NWA71523', 'NWA26257'], 'Q0006': ['NWA11425', 'NWA74843', 'NWA74894', 'NWA73716', 'NWA12143', 'NWA74515', 'NWA08916', 'NWA22037', 'NWA39137', 'NWA28648'], 'Q0007': ['NWA75577', 'NWA80268', 'NWA80281', 'NWA80275', 'NWA80276', 'NWA80274', 'NWA76584', 'NWA79781', 'NWA76199', 'NWA79764'], 'Q0008': ['NWA53603', 'NWA74718', 'NWA77305', 'NWA22883', 'NWA74

In [13]:
# Write the MuRIL run (the `results` mapping) to disk as JSON
import json

outputfile = "data/muril.json"
# indent=4 pretty-prints the file; ensure_ascii=False writes non-ASCII text as-is
dump_options = {"indent": 4, "ensure_ascii": False}

with open(outputfile, mode='w', encoding='utf-8') as f:
    json.dump(results, f, **dump_options)

# Calculating Top-1 accuracy, Top-5 accuracy and Mean Average Precision

In [16]:
# `json` is imported here explicitly so the cell does not depend on an earlier
# cell having imported it.
import json

import pandas as pd
from collections import defaultdict

ground_truth = pd.read_csv('data/groundtruth.csv')
file_path = "data/tfidf.json"

# Read the run back with the same encoding it was written in (UTF-8).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    predictions = json.load(file_handle)

# Build ground truth dictionary: QueryID -> value of the 'relevant docs' column
gt_dict = dict(zip(ground_truth['QueryID'], ground_truth['relevant docs']))

# Calculate metrics
def calculate_metrics(predictions, ground_truth, k_values=(1, 5)):
    """Compute Top-k accuracy and MAP for a retrieval run.

    Each query is scored against a single relevant document, so the average
    precision of a query is ``1 / rank`` of that document (0 when it is not in
    the ranked list).

    Args:
        predictions: Mapping of query id -> ranked list of document ids,
            best match first.
        ground_truth: Mapping of query id -> the relevant document id.
            Queries whose id is absent (or mapped to ``None``) are skipped.
        k_values: Cut-offs for Top-k accuracy. Any iterable of ints; the
            default is an immutable tuple instead of a shared mutable list.

    Returns:
        A dict with one ``'Top-{k} Accuracy'`` entry per cut-off plus ``'MAP'``.
        All values are ``0.0`` when no predicted query has ground truth, instead
        of raising ``ZeroDivisionError``.
    """
    def _mean(values):
        # An empty evaluation set averages to 0.0 rather than dividing by zero.
        return sum(values) / len(values) if values else 0.0

    hits_at_k = {k: [] for k in k_values}
    ap_scores = []

    for query_id, ranked_docs in predictions.items():
        relevant_doc = ground_truth.get(query_id)
        if relevant_doc is None:
            continue

        # Top-k accuracy: is the relevant document among the first k results?
        for k in k_values:
            hits_at_k[k].append(1 if relevant_doc in ranked_docs[:k] else 0)

        # Average precision with one relevant document: reciprocal of its 1-indexed rank.
        if relevant_doc in ranked_docs:
            ap_scores.append(1.0 / (ranked_docs.index(relevant_doc) + 1))
        else:
            ap_scores.append(0.0)

    metrics = {f'Top-{k} Accuracy': _mean(hits_at_k[k]) for k in k_values}
    metrics['MAP'] = _mean(ap_scores)
    return metrics

# Score the loaded run against the ground truth and print a short report
metrics = calculate_metrics(predictions, gt_dict)

report_lines = ["Evaluation Metrics:", "-" * 40]
report_lines.extend(
    f"{metric_name}: {score:.4f} ({score*100:.2f}%)"
    for metric_name, score in metrics.items()
)
# One line per entry, exactly as separate print() calls would produce.
print("\n".join(report_lines))

Evaluation Metrics:
----------------------------------------
Top-1 Accuracy: 0.7230 (72.30%)
Top-5 Accuracy: 0.8860 (88.60%)
MAP: 0.7948 (79.48%)


In [17]:
# `json` is imported here explicitly so the cell does not depend on an earlier
# cell having imported it.
import json

import pandas as pd
from collections import defaultdict

# Evaluate the MuRIL run with the same ground truth and metric code as the
# TF-IDF run: `gt_dict` and `calculate_metrics` are defined in the TF-IDF
# evaluation cell above, so they are reused here instead of being redefined
# (a second copy of the function would silently shadow the first one).
file_path = "data/muril.json"

# Read the run back with the same encoding it was written in (UTF-8).
with open(file_path, 'r', encoding='utf-8') as file_handle:
    predictions = json.load(file_handle)

# Calculate and print results
metrics = calculate_metrics(predictions, gt_dict)

print("Evaluation Metrics:")
print("-" * 40)
for metric_name, score in metrics.items():
    print(f"{metric_name}: {score:.4f} ({score*100:.2f}%)")

Evaluation Metrics:
----------------------------------------
Top-1 Accuracy: 0.0120 (1.20%)
Top-5 Accuracy: 0.0260 (2.60%)
MAP: 0.0186 (1.86%)
