In [1]:
# import pandas as pd
# from transformers import AutoTokenizer
# import numpy as np
# import matplotlib.pyplot as plt
# import os

# # --- Configuration ---
# MODEL_NAME = "google/muril-base-cased"
# CORPUS_PATH = 'data/corpus_trec.csv' # Path to your 80k corpus file

# def analyze_token_lengths(corpus_path: str):
#     # --- 1. Load Data and Tokenizer ---
#     if not os.path.exists(corpus_path):
#         print(f"Error: Corpus file not found at {corpus_path}")
#         return

#     corpus_df = pd.read_csv(corpus_path)
#     # We only care about the 'text' column
#     corpus_texts = corpus_df['text'].dropna().astype(str).tolist()

#     print(f"Loaded {len(corpus_texts)} documents for analysis.")

#     # Load the MuRIL tokenizer
#     # The 'fast' version is much quicker for batch processing
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#     print(f"Loaded {MODEL_NAME} tokenizer.")

#     # --- 2. Calculate Token Length for Each Document ---

#     # We use a simple list comprehension to apply the tokenizer to all texts
#     # The length of the 'input_ids' list is the token count, including [CLS] and [SEP]
#     token_lengths = [
#         len(tokenizer.encode(text, add_special_tokens=True))
#         for text in corpus_texts
#     ]

#     # --- 3. Compute Key Statistics ---
#     token_lengths_np = np.array(token_lengths)

#     # Calculate the crucial percentiles
#     p_50 = np.percentile(token_lengths_np, 50)
#     p_75 = np.percentile(token_lengths_np, 75)
#     p_90 = np.percentile(token_lengths_np, 90)
#     p_95 = np.percentile(token_lengths_np, 95)

#     # --- 4. Print Results (The Decision Makers) ---

#     print("\n--- Document Token Length Statistics (MuRIL) ---")
#     print(f"Minimum length: {token_lengths_np.min()}")
#     print(f"Maximum length: {token_lengths_np.max()}")
#     print(f"Average length: {token_lengths_np.mean():.2f}")

#     print("\nPercentiles (What percentage of documents fit inside this length):")
#     print(f"50th Percentile (Median): {int(p_50)}")
#     print(f"75th Percentile: {int(p_75)}")
#     print(f"90th Percentile: {int(p_90)}")
#     print(f"95th Percentile: {int(p_95)}")

#     # --- 5. Optional: Plot Histogram (Visual Confirmation) ---
#     plt.figure(figsize=(10, 6))
#     # We only plot up to the 99th percentile for a clear view, ignoring massive outliers
#     plt.hist(token_lengths_np, bins=50, range=(0, np.percentile(token_lengths_np, 99)), color='skyblue', edgecolor='black')

#     # Draw a line at the typical 128-token setting
#     plt.axvline(x=128, color='red', linestyle='--', label='128 Tokens (Current Max Length)')

#     plt.title('Token Length Distribution of Corpus Documents')
#     plt.xlabel('Token Count (MuRIL Tokenizer)')
#     plt.ylabel('Number of Documents')
#     plt.legend()
#     plt.grid(axis='y', alpha=0.5)
#     plt.show()

# if __name__ == '__main__':
#     analyze_token_lengths(CORPUS_PATH)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import os

def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame, logging the path first.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` parsed from the file.
    """
    print(f"Loading data from {file_path}...")
    frame = pd.read_csv(file_path)
    return frame

def setup_and_split_data(train_size=500, eval_size=200, test_size=300, random_state=42):
    """Split the queries into train/eval/test sets and write the artefacts to ``data_output/``.

    Reads ``data/corpus_trec.csv``, ``data/queries.csv`` and ``data/groundtruth.csv``,
    joins queries to their ground truth on ``QueryID``, partitions the distinct
    query IDs into three disjoint groups, and saves one relevance CSV per group
    plus a copy of the corpus and a DocID -> text JSON lookup.

    Args:
        train_size: Number of distinct queries in the training split.
        eval_size: Number of distinct queries in the evaluation split.
        test_size: Number of distinct queries in the test split.
        random_state: Seed handed to both ``train_test_split`` calls.

    Raises:
        ValueError: If fewer distinct queries are available than the three
            splits need in total.
    """
    # --- 1. Load Data ---
    corpus_df = load_data('data/corpus_trec.csv')
    queries_df = load_data('data/queries.csv')
    gt_df = load_data('data/groundtruth.csv')

    # --- 2. Create Lookup Tables and Relevance Data ---
    doc_id_to_text = dict(zip(corpus_df['DocID'], corpus_df['text']))
    relevance_data = pd.merge(queries_df, gt_df, on='QueryID')

    # --- 3. Split Query IDs three ways ---
    all_qids = relevance_data['QueryID'].unique()
    total_qids = len(all_qids)
    required_qids = train_size + eval_size + test_size

    # Fail here with a clear message instead of announcing an adjustment that
    # never happens and letting train_test_split raise a less clear error.
    if total_qids < required_qids:
        raise ValueError(
            f"Total queries ({total_qids}) is less than the desired split size ({required_qids})."
        )

    # Step 1: carve off the training queries; everything else is the remainder.
    train_qids, remainder_qids = train_test_split(
        all_qids,
        train_size=train_size,
        random_state=random_state
    )

    # Step 2: take absolute-sized eval and test sets from the remainder. Any
    # queries beyond eval_size + test_size are left out of both sets.
    eval_qids, test_qids = train_test_split(
        remainder_qids,
        train_size=eval_size,
        test_size=test_size,
        random_state=random_state
    )

    # --- 4. Filter Relevance Data into Three Sets ---
    train_relevance = relevance_data[relevance_data['QueryID'].isin(train_qids)]
    eval_relevance = relevance_data[relevance_data['QueryID'].isin(eval_qids)]
    test_relevance = relevance_data[relevance_data['QueryID'].isin(test_qids)]

    # --- 5. Save All Metadata ---
    os.makedirs('data_output', exist_ok=True)

    train_relevance.to_csv('data_output/train_relevance.csv', index=False)
    eval_relevance.to_csv('data_output/eval_relevance.csv', index=False)
    test_relevance.to_csv('data_output/test_relevance.csv', index=False)
    corpus_df.to_csv('data_output/corpus_data.csv', index=False)

    # Save the DocID to Text map as a JSON file for quick lookups
    with open('data_output/docid_to_text.json', 'w', encoding='utf-8') as f:
        json.dump(doc_id_to_text, f, ensure_ascii=False)

    # Count distinct query IDs, not rows: a query with several ground-truth rows
    # would otherwise inflate the figure labelled "Queries".
    n_train = train_relevance['QueryID'].nunique()
    n_eval = eval_relevance['QueryID'].nunique()
    n_test = test_relevance['QueryID'].nunique()

    print("\nData Setup Complete.")
    print(f"Training Queries: {n_train} | Evaluation Queries: {n_eval} | Test Queries: {n_test}")

# Run the split when this cell (or the exported script) is executed directly.
if __name__ == '__main__':
    setup_and_split_data()

Loading data from data/corpus_trec.csv...
Loading data from data/queries.csv...
Loading data from data/groundtruth.csv...

Data Setup Complete.
Training Queries: 500 | Evaluation Queries: 200 | Test Queries: 300


In [3]:
%pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import re
import numpy as np
from rank_bm25 import BM25Okapi
import os # Need os for file path management if not already imported

# Matches maximal runs of characters in U+0900-U+0963 and U+0966-U+097F.
# The gap (U+0964 '।' and U+0965 '॥') is left out on purpose so those two
# punctuation marks end a token instead of becoming part of it.
_DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u0963\u0966-\u097F]+')

# Stopword list: one entry per line of data/stopwords.txt, whitespace-stripped.
# Loaded once when the cell runs; the path is relative to the working directory.
with open("data/stopwords.txt", "r", encoding="utf-8") as f:

    _STOPWORDS = set(line.strip() for line in f)

def simple_tokenizer(text):
    """Split ``text`` into tokens matched by ``_DEVANAGARI_PATTERN``, minus stopwords.

    Every non-overlapping match of the module-level pattern is a candidate
    token; candidates present in ``_STOPWORDS`` are dropped. Order and
    duplicates are preserved.

    Args:
        text: The string to tokenise.

    Returns:
        A list of the surviving tokens, in order of appearance.
    """
    kept = []
    for candidate in _DEVANAGARI_PATTERN.findall(text):
        if candidate in _STOPWORDS:
            continue
        kept.append(candidate)
    return kept

# -----------------------------------------------------------------------------------
# NOTE: 'mine_hard_negatives' is not tied to one data split: the caller passes the
# relevance file to mine from and the 'output_path' the triplets are written to.
# -----------------------------------------------------------------------------------

def mine_hard_negatives(relevance_path: str, corpus_path: str, output_path: str, top_k: int = 100, num_hard_negatives: int = 3):
    """Mine BM25 hard negatives for every (query, relevant doc) row and save triplets.

    A BM25 index is built over the whole corpus. For each row of the relevance
    file the query is scored against every document, and the highest-scoring
    documents that are not the row's positive become its hard negatives. One
    output row is written per (query, positive, negative) triplet.

    Args:
        relevance_path: CSV with ``QueryID``, ``Query`` and ``relevant docs`` columns.
        corpus_path: CSV with ``DocID`` and ``text`` columns.
        output_path: Destination CSV for the triplets.
        top_k: How many of the top BM25 results are searched for negatives.
        num_hard_negatives: Maximum number of negatives kept per relevance row.

    Rows whose positive ``DocID`` is absent from the corpus are skipped.
    """
    # --- 1. Load Data ---
    relevance_data = pd.read_csv(relevance_path)
    corpus_df = pd.read_csv(corpus_path)

    corpus_texts = corpus_df['text'].tolist()
    doc_ids = corpus_df['DocID'].tolist()
    doc_id_to_index = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # --- 2. BM25 Indexing ---
    tokenized_corpus = [simple_tokenizer(doc) for doc in corpus_texts]
    bm25 = BM25Okapi(tokenized_corpus)
    print(f"BM25 index built for {len(corpus_texts)} documents.")

    # --- 3. Mining Hard Negatives ---
    triplet_columns = ["query_id", "query", "positive_id", "negative_id", "positive_text", "negative_text"]
    triplets = []

    for _, row in relevance_data.iterrows():
        query = row['Query']
        gt_doc_id = row['relevant docs']
        positive_index = doc_id_to_index.get(gt_doc_id)

        # Skip rows whose positive document is missing from the corpus index.
        if positive_index is None:
            continue

        # Texts come from the in-memory list by position; filtering the whole
        # corpus frame by DocID twice per triplet is a full scan each time.
        positive_text = corpus_texts[positive_index]

        doc_scores = bm25.get_scores(simple_tokenizer(query))
        ranked_indices = np.argsort(doc_scores)[::-1][:top_k]

        hn_count = 0
        for idx in ranked_indices:
            # Check the quota before appending so num_hard_negatives=0 yields none.
            if hn_count >= num_hard_negatives:
                break
            if idx == positive_index:
                continue

            triplets.append({
                "query_id": row['QueryID'],
                "query": query,
                "positive_id": gt_doc_id,
                "negative_id": doc_ids[idx],
                "positive_text": positive_text,
                "negative_text": corpus_texts[idx],
            })
            hn_count += 1

    # --- 4. Save Triplets ---
    # Explicit columns keep the header row even when no triplet was mined, so
    # the file stays readable by pd.read_csv.
    triplets_df = pd.DataFrame(triplets, columns=triplet_columns)
    triplets_df.to_csv(output_path, index=False)
    print(f"\nSuccessfully mined and saved {len(triplets)} triplets to {output_path} (from {relevance_path}).")


# Mine triplets for the training and evaluation splits. Both calls share the
# same corpus file and mining settings; each call builds its own BM25 index.
if __name__ == '__main__':
    CORPUS_PATH = 'data_output/corpus_data.csv'
    NUM_NEGATIVES = 3  # hard negatives kept per relevance row
    TOP_K = 100  # BM25 results searched for negatives

    # 1. Mine Hard Negatives for the TRAINING set
    print("--- Mining Training Triplets (500 Queries) ---")
    mine_hard_negatives(
        relevance_path='data_output/train_relevance.csv', # Relevance rows of the training split
        corpus_path=CORPUS_PATH,
        output_path='data_output/train_triplets.csv', # Training triplets output file
        num_hard_negatives=NUM_NEGATIVES,
        top_k=TOP_K
    )

    # 2. Mine Hard Negatives for the EVALUATION set
    print("\n--- Mining Evaluation Triplets (200 Queries) ---")
    mine_hard_negatives(
        relevance_path='data_output/eval_relevance.csv', # Relevance rows of the evaluation split
        corpus_path=CORPUS_PATH,
        output_path='data_output/eval_triplets.csv', # Evaluation triplets output file
        num_hard_negatives=NUM_NEGATIVES,
        top_k=TOP_K
    )

--- Mining Training Triplets (500 Queries) ---
BM25 index built for 80380 documents.

Successfully mined and saved 1500 triplets to data_output/train_triplets.csv (from data_output/train_relevance.csv).

--- Mining Evaluation Triplets (200 Queries) ---
BM25 index built for 80380 documents.

Successfully mined and saved 600 triplets to data_output/eval_triplets.csv (from data_output/eval_relevance.csv).


In [1]:
%pip install datasets

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from typing import Dict, Any
from dataclasses import dataclass

# --- Constants ---
MODEL_NAME = "sundeepdwd/muril-mlm-newa-finetuned"
MAX_LENGTH = 512
TEMPERATURE = 0.05

# --- TripleCollator: stacks tokenised triplets into batch tensors ---
@dataclass
class TripleCollator:
    """Collate tokenised (query, positive, negative) examples into batch tensors.

    Each example must carry ``<role>_input_ids`` and ``<role>_attention_mask``
    lists for the roles ``query``, ``positive`` and ``negative``. The result is
    a dict with the groups ``'query'``, ``'positive'`` and ``'negatives'``, each
    holding stacked ``'input_ids'`` and ``'attention_mask'`` tensors. The lists
    must share one length per column, since ``torch.stack`` rejects ragged rows.
    """

    # Stored on the instance but not read by __call__.
    tokenizer: AutoTokenizer

    def __call__(self, features):
        def stack_column(column):
            # One tensor per example, stacked along a new leading batch axis.
            rows = [torch.tensor(example[column]) for example in features]
            return torch.stack(rows)

        # (output group name, feature-key prefix); only the negatives group is
        # pluralised in the output.
        layout = (
            ('query', 'query'),
            ('positive', 'positive'),
            ('negatives', 'negative'),
        )

        batch = {}
        for group, prefix in layout:
            batch[group] = {
                'input_ids': stack_column(f'{prefix}_input_ids'),
                'attention_mask': stack_column(f'{prefix}_attention_mask'),
            }
        return batch

# --- BiEncoderTrainer: Trainer subclass with a contrastive loss ---
class BiEncoderTrainer(Trainer):
    """Trainer that optimises one shared encoder with a contrastive objective.

    Queries, positives and negatives all pass through the same ``model``. Each
    query is scored against every positive and every negative in the batch by
    the dot product of L2-normalised first-token embeddings, divided by
    ``TEMPERATURE``; cross-entropy then pushes query ``i`` towards positive ``i``.

    NOTE(review): the mined triplet files hold several rows per query that share
    one positive. If two such rows land in the same batch, that shared positive
    is scored as a negative for the other row -- confirm this is acceptable.
    """

    @staticmethod
    def _first_token_embedding(model, encoded):
        """Return the L2-normalised hidden state at sequence position 0."""
        hidden = model(**encoded).last_hidden_state
        return F.normalize(hidden[:, 0, :], p=2, dim=1)

    def _info_nce_loss(self, model, inputs):
        """Compute the contrastive loss shared by training and evaluation."""
        q_emb = self._first_token_embedding(model, inputs['query'])
        p_emb = self._first_token_embedding(model, inputs['positive'])
        # Indexing [:, 0, :] already yields a 2-D (rows, hidden) tensor, so no
        # reshape of the negatives is needed before concatenation.
        n_emb = self._first_token_embedding(model, inputs['negatives'])

        # Candidates: all positives first, then all negatives. Column i is
        # therefore the correct answer for query i.
        doc_embs = torch.cat([p_emb, n_emb], dim=0)
        scores = torch.matmul(q_emb, doc_embs.T) / TEMPERATURE
        target = torch.arange(q_emb.size(0), device=q_emb.device)
        return F.cross_entropy(scores, target)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the contrastive loss, or ``(loss, None)`` when outputs are requested.

        ``num_items_in_batch`` is accepted for signature compatibility and unused.
        """
        loss = self._info_nce_loss(model, inputs)
        return (loss, None) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Compute the evaluation loss without gradients.

        Returns ``(loss, None, None)``: this setup has no logits or labels to
        report, so only the loss is populated.
        """
        model.eval()
        with torch.no_grad():
            loss = self._info_nce_loss(model, inputs)
        return (loss, None, None)

# --- Main Training Function ---
def train_muril_biencoder(train_triplets_path: str, eval_triplets_path: str):
    """Fine-tune the ``MODEL_NAME`` encoder on triplets, evaluating every epoch.

    Both CSVs must provide ``query``, ``positive_text`` and ``negative_text``
    columns. Every text is tokenised to exactly ``MAX_LENGTH`` tokens (truncated
    or padded). After training, the checkpoint with the lowest evaluation loss
    is reloaded and saved with its tokenizer to ``./final_muril_encoder``.

    Args:
        train_triplets_path: CSV of training triplets.
        eval_triplets_path: CSV of evaluation triplets.
    """
    # --- 1. Load data, tokenizer and encoder ---
    train_df = pd.read_csv(train_triplets_path)
    eval_df = pd.read_csv(eval_triplets_path)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)

    # (output key prefix, source column) for the three texts of a triplet.
    text_fields = (
        ('query', 'query'),
        ('positive', 'positive_text'),
        ('negative', 'negative_text'),
    )

    def tokenize_function(examples: Dict[str, Any]):
        encoded = {}
        for prefix, column in text_fields:
            enc = tokenizer(examples[column], max_length=MAX_LENGTH, truncation=True, padding='max_length')
            encoded[f'{prefix}_input_ids'] = enc['input_ids']
            encoded[f'{prefix}_attention_mask'] = enc['attention_mask']
        return encoded

    def tokenize_frame(frame):
        # Drop the raw CSV columns so only the token fields reach the collator.
        raw_dataset = Dataset.from_pandas(frame)
        return raw_dataset.map(tokenize_function, batched=True, remove_columns=frame.columns.tolist())

    # --- 2. Tokenize both datasets ---
    tokenized_train_dataset = tokenize_frame(train_df)
    tokenized_eval_dataset = tokenize_frame(eval_df)

    # --- 3. Training arguments ---
    schedule_options = dict(
        num_train_epochs=5,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,
        warmup_steps=100,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
    )
    logging_options = dict(
        output_dir="./muril_dr_checkpoints",
        logging_strategy='epoch',
        logging_dir='./logs',
        save_total_limit=1,
        # The collator reads the custom token columns, so they must be kept.
        remove_unused_columns=False,
    )
    # Evaluate and checkpoint once per epoch; keep the lowest-loss checkpoint.
    selection_options = dict(
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='loss',
        greater_is_better=False,
    )
    training_args = TrainingArguments(**logging_options, **schedule_options, **selection_options)

    # --- 4. Build the trainer with both datasets ---
    trainer = BiEncoderTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        data_collator=TripleCollator(tokenizer)
    )

    print("Starting MuRIL fine-tuning with Evaluation Monitoring...")
    trainer.train()

    final_model_path = "./final_muril_encoder"
    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    print(f"\nFinal MuRIL encoder saved to {final_model_path}")

# Train on the triplet files written by the hard-negative mining cell.
if __name__ == '__main__':
    # Separate triplet files for the training and evaluation splits
    train_muril_biencoder(
        train_triplets_path='data_output/train_triplets.csv',
        eval_triplets_path='data_output/eval_triplets.csv'
    )

Some weights of BertModel were not initialized from the model checkpoint at sundeepdwd/muril-mlm-newa-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Starting MuRIL fine-tuning with Evaluation Monitoring...


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss
1,2.7419,2.710866
2,2.4491,1.990322
3,1.2833,1.232971
4,0.5189,1.156662
5,0.2995,1.158696



Final MuRIL encoder saved to ./final_muril_encoder


In [None]:
%pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import torch
import numpy as np
import faiss
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModel
from typing import List, Dict

# --- Configuration ---
FINAL_MODEL_PATH = "./final_muril_encoder"
MAX_LENGTH = 512
TOP_K_RETRIEVAL = 10 # Retrieve top 10 documents for comprehensive evaluation

# --- Metric Helper Functions ---

def calculate_ap(retrieved_indices: np.ndarray, gt_index: int, k: int) -> float:
    """Average Precision for a query with exactly one relevant document.

    With a single relevant item, AP equals the reciprocal of the rank at which
    that item appears, so this returns ``1 / rank`` over the top ``k`` results.

    Args:
        retrieved_indices: Ranked result indices, best first (array or sequence).
        gt_index: Index of the single relevant document.
        k: Cut-off; only the first ``k`` results are considered.

    Returns:
        ``1.0 / rank`` (rank is 1-based) if ``gt_index`` is within the top ``k``,
        otherwise ``0.0``.
    """
    # np.asarray lets plain lists work; the elementwise comparison below needs
    # an array.
    top_k = np.asarray(retrieved_indices)[:k]
    hit_positions = np.flatnonzero(top_k == gt_index)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (int(hit_positions[0]) + 1)


def calculate_ndcg(retrieved_indices: np.ndarray, gt_index: int, k: int) -> float:
    """nDCG@k for a query with exactly one relevant document and binary gains.

    DCG follows the standard form ``sum(rel_i / log2(i + 1))`` over 1-based
    ranks ``i``. With one relevant document the ideal ranking puts it first, so
    the ideal DCG is ``1 / log2(2) = 1`` and the result equals
    ``1 / log2(rank + 1)`` when the document is found.

    Args:
        retrieved_indices: Ranked result indices, best first (array or sequence).
        gt_index: Index of the single relevant document.
        k: Cut-off; only the first ``k`` results are considered.

    Returns:
        The nDCG value, or ``0.0`` when the relevant document is not in the top
        ``k`` or nothing was retrieved.
    """
    top_k = np.asarray(retrieved_indices)[:k]
    if top_k.size == 0:
        return 0.0

    # Binary gain per position: 1.0 where the ground-truth document sits.
    gains = (top_k == gt_index).astype(float)

    # Position p (0-based) has rank p + 1 and discount log2(rank + 1), i.e.
    # log2(2), log2(3), ... Every position is divided by its own discount, so
    # rank 2 no longer scores the same as rank 1.
    discounts = np.log2(np.arange(2, gains.size + 2))
    dcg = float(np.sum(gains / discounts))

    # Ideal ordering for a single relevant document: gain 1 at rank 1.
    idcg = 1.0
    return dcg / idcg

# --- Utility Functions ---
def encode_texts(texts: List[str], model: AutoModel, tokenizer: AutoTokenizer,
                 batch_size: int = 32, normalize: bool = True) -> np.ndarray:
    """Encode texts into dense vectors taken from sequence position 0.

    Texts are processed in mini-batches. Each batch is truncated to
    ``MAX_LENGTH`` tokens and padded to its own longest text; padded positions
    are excluded through the attention mask.

    Args:
        texts: Strings to encode.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of texts per forward pass.
        normalize: L2-normalise each vector. Training scores L2-normalised
            embeddings by dot product; on unit vectors, squared Euclidean
            distance is ``2 - 2 * cosine``, so an L2 index then ranks
            documents by that same cosine score.

    Returns:
        A float32 array of shape ``(len(texts), hidden_size)``; an empty 1-D
        float32 array when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            inputs = tokenizer(batch, return_tensors='pt', max_length=MAX_LENGTH, truncation=True, padding=True)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            # First-token embedding for every text in the batch: (batch, hidden).
            vectors = model(**inputs).last_hidden_state[:, 0, :]
            if normalize:
                vectors = torch.nn.functional.normalize(vectors, p=2, dim=1)
            chunks.append(vectors.cpu().numpy())

    if not chunks:
        return np.array([], dtype='float32')
    return np.concatenate(chunks, axis=0).astype('float32')

# --- Main Evaluation Function ---
def evaluate_retrieval(test_data: pd.DataFrame, index: faiss.Index, corpus_id_map: Dict[str, int], model: AutoModel, tokenizer: AutoTokenizer) -> Dict[str, float]:
    """Score retrieval quality over a set of queries with one relevant document each.

    Args:
        test_data: Frame with ``Query`` and ``relevant docs`` columns.
        index: FAISS index over the corpus embeddings.
        corpus_id_map: Maps a DocID to its row position in ``index``.
        model: Encoder used to embed the queries.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with ``MRR``, ``MAP``, ``NDCG@K``, ``Recall@K``, ``Accuracy@1`` and
        ``Accuracy@5`` (K = ``TOP_K_RETRIEVAL``), each averaged over every row
        of ``test_data``. Rows whose relevant document is missing from
        ``corpus_id_map`` are not searched and contribute zero to every metric.
        All metrics are ``0.0`` when ``test_data`` is empty.
    """
    metric_names = (
        "MRR",
        "MAP",
        f"NDCG@{TOP_K_RETRIEVAL}",
        f"Recall@{TOP_K_RETRIEVAL}",
        "Accuracy@1",
        "Accuracy@5",
    )
    ndcg_key, recall_key = metric_names[2], metric_names[3]
    totals = {name: 0.0 for name in metric_names}

    num_queries = len(test_data)
    if num_queries == 0:
        # Nothing to average; avoid dividing by zero.
        return totals

    # Resolve ground-truth row positions up front and keep only the queries
    # whose relevant document exists in the corpus.
    queries = test_data['Query'].tolist()
    gt_positions = [corpus_id_map.get(doc_id) for doc_id in test_data['relevant docs'].tolist()]
    scorable = [(query, gt) for query, gt in zip(queries, gt_positions) if gt is not None]

    if scorable:
        # Encode all queries in one call and run one batched FAISS search
        # instead of one encode and one search per query.
        query_embeddings = encode_texts([query for query, _ in scorable], model, tokenizer)
        _, neighbours = index.search(query_embeddings, TOP_K_RETRIEVAL)

        for retrieved_indices, (_, gt_index) in zip(neighbours, scorable):
            hit_positions = np.flatnonzero(retrieved_indices == gt_index)
            if hit_positions.size == 0:
                continue

            rank = int(hit_positions[0]) + 1  # 1-based rank of the relevant doc

            totals["MRR"] += 1.0 / rank
            totals[recall_key] += 1
            totals["MAP"] += calculate_ap(retrieved_indices, gt_index, TOP_K_RETRIEVAL)
            totals[ndcg_key] += calculate_ndcg(retrieved_indices, gt_index, TOP_K_RETRIEVAL)
            if rank == 1:
                totals["Accuracy@1"] += 1
            if rank <= 5:
                totals["Accuracy@5"] += 1

    return {name: total / num_queries for name, total in totals.items()}


def run_indexing_and_eval():
    """Embed the corpus, build and save a FAISS index, then report test-set metrics.

    Loads the encoder and tokenizer from ``FINAL_MODEL_PATH``, encodes every
    document in ``data_output/corpus_data.csv``, stores the vectors in an exact
    L2 index written to ``data_output/final_faiss_index.bin``, and prints the
    metrics ``evaluate_retrieval`` returns for ``data_output/test_relevance.csv``.
    """
    # --- 1. Load model, tokenizer and data ---
    tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL_PATH)
    model = AutoModel.from_pretrained(FINAL_MODEL_PATH)

    corpus_frame = pd.read_csv('data_output/corpus_data.csv')
    test_frame = pd.read_csv('data_output/test_relevance.csv')

    documents = corpus_frame['text'].tolist()

    # DocID -> row position; index rows are added in this same order below.
    position_of_doc = {}
    for position, doc_id in enumerate(corpus_frame['DocID'].tolist()):
        position_of_doc[doc_id] = position

    # --- 2. Encode the corpus ---
    print(f"Encoding {len(documents)} documents...")
    document_vectors = encode_texts(documents, model, tokenizer)

    # --- 3. Build and persist the FAISS index ---
    # IndexFlatL2 performs exact search ranked by Euclidean distance.
    # NOTE(review): training scores unit-length embeddings by dot product; L2
    # ranking agrees with that only if encode_texts returns unit-length
    # vectors -- confirm.
    embedding_dim = document_vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(document_vectors)

    faiss.write_index(index, "data_output/final_faiss_index.bin")
    print(f"FAISS Index built and saved with {index.ntotal} vectors.")

    # --- 4. Evaluate on the test split ---
    print("\n--- Running Evaluation on Test Set ---")
    metrics = evaluate_retrieval(test_frame, index, position_of_doc, model, tokenizer)

    print("\n✅ Final Retrieval Performance:")
    print("---------------------------------")
    for metric_name in metrics:
        print(f"{metric_name:<15}: {metrics[metric_name]:.4f}")
    print("---------------------------------")

# Build the index and evaluate when this cell (or the exported script) is run directly.
if __name__ == '__main__':
    run_indexing_and_eval()

Encoding 80380 documents...
