<a href="https://colab.research.google.com/github/rititripathi09/MIRaCLE/blob/main/CLMIR(2025)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RUN-01

In [None]:
!pip install pandas numpy sentence-transformers sympy faiss-cpu
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import sympy as sp
import re
import warnings
import torch
import faiss
import os
from multiprocessing import Pool
import time
print("Pandas imported successfully, version:", pd.__version__)
warnings.filterwarnings('ignore')

# Colab-only helper, needed solely for the Drive mount below.
from google.colab import drive
try:
    drive.mount('/content/drive')
except Exception as e:
    raise RuntimeError(f"Failed to mount Google Drive: {e}")

# MODEL INITIALISATION: run on the GPU when one is visible, otherwise on the CPU
_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=_device)
expected_dim = model.get_sentence_embedding_dimension()
print(f"Model expected dimension: {expected_dim}")
print(f"Using device: {_device}")

# Input and output locations on the mounted Drive
train_data_path = '/content/drive/MyDrive/clmir/indexed_documents.csv'
test_data_path = '/content/drive/MyDrive/clmir/indexed_test_data.csv'
train_embeddings_path = '/content/drive/MyDrive/clmir/document_embeddings.csv'
output_path = '/content/drive/MyDrive/clmir/retrieval_results.csv'

# DEBUGGING OF THE FILE PATHS
for _label, _path in (
    ("Train data path", train_data_path),
    ("Test data path", test_data_path),
    ("Train embeddings path", train_embeddings_path),
    ("Output path", output_path),
):
    print(f"{_label}: {_path}, Type: {type(_path)}")

# VALIDATION OF THE FILE PATH TYPES - RAISE IF A PATH IS NOT PATH-LIKE
for _name, _path in (("train_data_path", train_data_path), ("test_data_path", test_data_path)):
    if not isinstance(_path, (str, bytes, os.PathLike)):
        raise TypeError(f"{_name} must be a string or PathLike, got {type(_path)}")

# CHECK THAT THE INPUT FILES EXIST
for _kind, _path in (("Training", train_data_path), ("Test", test_data_path)):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_kind} file not found at {_path}. Ensure the file exists and Google Drive is mounted.")

# DATA LOADING, THEN PRINT COLUMNS AND SIZES FOR DEBUGGING
start_time = time.time()
try:
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
except Exception as e:
    raise RuntimeError(f"Error loading CSV files: {e}")
print(f"load_data took {time.time() - start_time:.2f} seconds")
print("Training data columns:", train_df.columns.tolist())
print("Test data columns:", test_df.columns.tolist())
print(f"Training data size: {len(train_df)} documents")
print(f"Test data size: {len(test_df)} queries")

# Schema checks: both frames must expose the columns the pipeline reads later
if not {'docno', 'text'} <= set(train_df.columns):
    raise KeyError("Training data must contain 'docno' and 'text' columns")
if not {'qid', 'query'} <= set(test_df.columns):
    raise KeyError("Test data must contain 'qid' and 'query' columns")
# The run is only accepted for a test set of exactly 50 queries
if len(test_df) != 50:
    raise ValueError(f"Test set must contain exactly 50 queries, got {len(test_df)}")

# CLEANING OF LATEX IN MATHEMATICAL EXPRESSIONS, IF THERE ARE ANY
def clean_latex(expr):
    """Reduce a LaTeX-ish snippet to a plain string SymPy has a chance to parse.

    Steps, in order: drop ``$`` math delimiters, drop LaTeX commands
    (a backslash followed by letters), drop braces, turn ``^`` into ``**``
    and remove all whitespace. The conversion is lossy: commands are removed
    rather than translated, so e.g. ``\\frac{a}{b}`` becomes ``ab``.
    """
    # Delimiters must go first: a leftover '$' makes the expression unparseable.
    expr = expr.replace('$', '')
    expr = re.sub(r'\\[a-zA-Z]+', '', expr)
    expr = expr.replace('{', '').replace('}', '')
    expr = expr.replace('^', '**')
    expr = re.sub(r'\s+', '', expr)
    return expr


# What counts as "math" inside running text. Alternatives are tried in order:
#   1. an inline ``$...$`` span;
#   2. a sum/integral/partial/limit keyword (optionally LaTeX-prefixed with a
#      backslash) directly followed by a parenthesised argument;
#   3. a whitespace-free token that contains at least one of ``^ * / + =`` and
#      at least one word character. Plain words and hyphenated words therefore
#      stay in the text; expressions written with spaces ("a + b") are not
#      recognised by this alternative.
_MATH_PATTERN = re.compile(
    r'\$.*?\$'
    r'|\\?\b(?:sum|integral|partial|limit)(?![a-zA-Z])[^\s()]*\([^()\n]*\)'
    r'|(?=[\w\^*/+\-()=]*[\^*/+=])(?=[\^*/+\-()=]*\w)[\w\^*/+\-()=]+'
)


# EXTRACTION OF TEXT AND MATHEMATICAL EXPRESSION
def extract_text_math(content):
    """Split *content* into plain text and (at most one) math expression.

    Args:
        content: Cell value to process. ``None`` and float NaN are treated as
            empty text; anything else is converted with ``str``.

    Returns:
        ``(clean_text, math_expressions)`` where ``clean_text`` is the input
        with every math match removed and stripped, and ``math_expressions``
        is a list holding the first match that is non-empty after
        ``clean_latex`` (empty list when there is none).
    """
    # Missing values would otherwise be embedded as the literal text "nan".
    if content is None or (isinstance(content, float) and content != content):
        content = ''
    text = str(content)

    math_expressions = []
    for raw_match in _MATH_PATTERN.findall(text):
        cleaned = clean_latex(raw_match)
        if cleaned:
            # Only the first usable expression is kept.
            math_expressions.append(cleaned)
            break

    clean_text = _MATH_PATTERN.sub('', text).strip()
    return clean_text, math_expressions

# SymPy is used to turn expression strings into symbolic objects for the math
# similarity; results are memoised because the same strings recur many times.
math_cache = {}
def cached_sympify(expr):
    """Parse *expr* with ``sp.sympify(expr, evaluate=False)``, memoising the result.

    Returns the parsed object, or ``None`` when parsing fails. Failures are
    cached as well, so a bad expression is only attempted once.

    NOTE: ``sympify`` evaluates its input string; only feed it data you trust.
    """
    if expr not in math_cache:
        try:
            math_cache[expr] = sp.sympify(expr, evaluate=False)
        except Exception:
            # Any parsing problem marks the expression as unusable, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            math_cache[expr] = None
    return math_cache[expr]

# compute math expression similarity
def math_similarity(expr1, expr2):
    """Score how similar two expression strings are.

    Returns:
        1.0 when the parsed expressions have the same string form, compare
        equal, or their simplified difference is zero;
        0.5 when both parse but none of those checks succeeds;
        0.0 when either expression cannot be parsed or any step raises.
    """
    try:
        parsed_expr1 = cached_sympify(expr1)
        parsed_expr2 = cached_sympify(expr2)
        if parsed_expr1 is None or parsed_expr2 is None:
            return 0.0
        # Cheap checks first; simplify() is only reached when they fail.
        if str(parsed_expr1) == str(parsed_expr2) or parsed_expr1 == parsed_expr2:
            return 1.0
        if sp.simplify(parsed_expr1 - parsed_expr2) == 0:
            return 1.0
        return 0.5
    except Exception:
        # Best-effort scoring: a failing comparison counts as "no similarity".
        # Exception (not a bare except) so Ctrl-C can still stop a slow simplify.
        return 0.0

# Preprocessing: derive a text column and a math-expression column per frame
start_time = time.time()
_train_text, _train_math = zip(*train_df['text'].apply(extract_text_math))
train_df['clean_text'] = _train_text
train_df['math_expressions'] = _train_math
_test_text, _test_math = zip(*test_df['query'].apply(extract_text_math))
test_df['clean_text'] = _test_text
test_df['math_expressions'] = _test_math
# Empty or missing text is replaced by a single space before encoding
for _frame in (train_df, test_df):
    _frame['clean_text'] = _frame['clean_text'].replace('', ' ').fillna(' ')
print(f"preprocess_data took {time.time() - start_time:.2f} seconds")

# Load the cached train embeddings when they are still valid, otherwise
# (re)compute them and refresh the cache.
start_time = time.time()
batch_size = 128 if torch.cuda.is_available() else 64
# The cache is only usable if it has one row per document AND the model's width;
# a row-count mismatch would silently pair embeddings with the wrong docnos.
expected_shape = (len(train_df), expected_dim)
train_text_embeddings = None
if os.path.exists(train_embeddings_path):
    print("Loading precomputed train embeddings...")
    embeddings_df = pd.read_csv(train_embeddings_path)
    print(f"Precomputed embeddings CSV shape: {embeddings_df.shape}")
    if embeddings_df.shape == expected_shape:
        train_text_embeddings = torch.tensor(embeddings_df.values, dtype=torch.float32).to(model.device)
        print(f"Loaded train embeddings shape: {train_text_embeddings.shape}")
    else:
        print(f"Warning: Precomputed embeddings shape {embeddings_df.shape} does not match expected shape {expected_shape}. Recomputing embeddings...")
else:
    print("Computing train embeddings...")

if train_text_embeddings is None:
    # Single encode-and-save path shared by "no cache" and "stale cache".
    train_text_embeddings = model.encode(train_df['clean_text'].tolist(), batch_size=batch_size, convert_to_tensor=True, show_progress_bar=True)
    pd.DataFrame(train_text_embeddings.cpu().numpy()).to_csv(train_embeddings_path, index=False)
    print(f"Saved new train embeddings to {train_embeddings_path}, shape: {train_text_embeddings.shape}")

test_text_embeddings = model.encode(test_df['clean_text'].tolist(), batch_size=batch_size, convert_to_tensor=True, show_progress_bar=True)
print(f"Test embeddings shape: {test_text_embeddings.shape}")
print(f"load_or_compute_embeddings took {time.time() - start_time:.2f} seconds")

# dimensionality verification for the embeddings dimensions
if train_text_embeddings.shape[1] != expected_dim:
    raise ValueError(f"Train embeddings dimension {train_text_embeddings.shape[1]} does not match model dimension {expected_dim}")
if test_text_embeddings.shape[1] != expected_dim:
    raise ValueError(f"Test embeddings dimension {test_text_embeddings.shape[1]} does not match model dimension {expected_dim}")

# Building FAISS index with IVF optimization
start_time = time.time()
dimension = train_text_embeddings.shape[1]
train_embeddings_np = np.ascontiguousarray(train_text_embeddings.cpu().numpy(), dtype=np.float32)
# Unit-normalise in place so inner product equals cosine similarity.
faiss.normalize_L2(train_embeddings_np)
# k-means training needs at least as many vectors as clusters, so cap the
# cluster count by the corpus size instead of always asking for 100.
nlist = max(1, min(100, len(train_embeddings_np)))
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(train_embeddings_np)
index.add(train_embeddings_np)
# Never probe more inverted lists than exist.
index.nprobe = min(10, nlist)
print(f"build_faiss_index took {time.time() - start_time:.2f} seconds")

# compute hybrid similarity for a single query
def compute_query_similarities(args):
    """Rank documents for one query by a weighted text + math score.

    Args:
        args: Tuple ``(i, query_emb, query_math, train_embeddings, train_math,
            docnos)``. ``query_emb`` may be a torch tensor or any array-like;
            ``train_embeddings`` is only consulted for its second dimension;
            ``train_math`` and ``docnos`` are indexed with the row ids
            returned by the module-level FAISS ``index``.

    Returns:
        ``(i, top)`` where ``top`` is a list of up to 50 ``(docno, score)``
        pairs sorted by descending score, with
        ``score = 0.7 * text_similarity + 0.3 * math_similarity``.

    Raises:
        ValueError: If the query embedding width differs from the train
            embedding width.
    """
    start_time = time.time()
    i, query_emb, query_math, train_embeddings, train_math, docnos = args
    # Torch tensors are moved to host memory first; plain arrays pass through.
    if hasattr(query_emb, 'cpu'):
        query_emb = query_emb.cpu().numpy()
    query_emb_np = np.ascontiguousarray(query_emb, dtype=np.float32).reshape(1, -1)
    if query_emb_np.shape[1] != train_embeddings.shape[1]:
        raise ValueError(f"Query embedding dimension {query_emb_np.shape[1]} does not match index dimension {train_embeddings.shape[1]}")
    faiss.normalize_L2(query_emb_np)
    k = 500
    distances, indices = index.search(query_emb_np, k)
    num_docs = len(docnos)
    similarities = []
    for text_sim, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with label -1 when fewer than k
        # neighbours are found; those (and any out-of-range id) are skipped.
        if idx < 0 or idx >= num_docs:
            continue
        doc_math = train_math[idx]
        math_sim = 0.0
        if query_math and doc_math:
            # Only the first expression on each side is compared.
            math_sim = math_similarity(query_math[0], doc_math[0])
        sim_score = 0.7 * text_sim + 0.3 * math_sim
        similarities.append((docnos[idx], sim_score))
    similarities.sort(key=lambda x: x[1], reverse=True)
    print(f"compute_query_similarities for query {i} took {time.time() - start_time:.2f} seconds")
    return i, similarities[:50]
# Parallel similarity computation
start_time = time.time()
# Every argument tuple is pickled and sent to a worker process. Move the
# embeddings to host memory first: rebuilding CUDA tensors inside fork-started
# workers would require re-initialising CUDA there, which PyTorch does not support.
train_embeddings_cpu = train_text_embeddings.cpu()
args = [(i, query_emb, test_df['math_expressions'][i], train_embeddings_cpu, train_df['math_expressions'], train_df['docno'])
        for i, query_emb in enumerate(test_text_embeddings.cpu())]
# The pool is created outside the try block so that the cleanup below can
# never hit an unbound name if Pool() itself fails.
pool = Pool()
try:
    results_parallel = pool.map(compute_query_similarities, args)
finally:
    pool.close()
    pool.join()
print(f"compute_similarities_parallel took {time.time() - start_time:.2f} seconds")
# Generate, collect and save the results
start_time = time.time()
results = []
# Walk the per-query results in query order and flatten them to one row per hit
for i, similarities in sorted(results_parallel, key=lambda pair: pair[0]):
    query_id = test_df['qid'][i]
    results.extend(
        {
            'query_ID': query_id,
            'retrieved_body_ID': docno,
            'Run No.': 1,
            'Similarity Score': sim_score,
        }
        for docno, sim_score in similarities
    )
print(f"collect_results took {time.time() - start_time:.2f} seconds")
try:
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")
    print(f"Output shape: {results_df.shape}")
except Exception as e:
    raise RuntimeError(f"Error saving results to {output_path}: {e}")

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.4/31.4 MB 37.1 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Pandas imported successfully, version: 2.2.2


RuntimeError: Failed to mount Google Drive: mount failed

RUN-02

In [None]:
# Install all required dependencies
!pip install pandas numpy sentence-transformers sympy faiss-cpu spacy
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import sympy as sp
import re
import warnings
import torch
import faiss
import os
from multiprocessing import Pool
import time
import spacy
from torch.cuda.amp import autocast

# Suppress warnings
warnings.filterwarnings('ignore')
print("Pandas imported successfully, version:", pd.__version__)

# Mount Google Drive
from google.colab import drive
try:
    drive.mount('/content/drive')
except Exception as e:
    raise RuntimeError(f"Failed to mount Google Drive: {e}") from e

# MODEL INITIALIZATION
try:
    model = SentenceTransformer('all-mpnet-base-v2', device='cuda' if torch.cuda.is_available() else 'cpu')
except Exception as e:
    raise RuntimeError(f"Failed to initialize SentenceTransformer: {e}") from e
expected_dim = model.get_sentence_embedding_dimension()
print(f"Model expected dimension: {expected_dim}")
print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
# spaCy pipeline used for tokenising the text during preprocessing
nlp = spacy.load("en_core_web_sm")

# File paths
train_data_path = '/content/drive/MyDrive/clmir/indexed_documents.csv'
test_data_path = '/content/drive/MyDrive/clmir/indexed_test_data.csv'
train_embeddings_path = '/content/drive/MyDrive/clmir/train_embeddings.csv'
output_path = '/content/drive/MyDrive/clmir/run02.csv'
visualization_path = '/content/drive/MyDrive/clmir/similarity_visualization.csv'

# DEBUGGING OF FILE PATHS
print(f"Train data path: {train_data_path}, Type: {type(train_data_path)}")
print(f"Test data path: {test_data_path}, Type: {type(test_data_path)}")
print(f"Train embeddings path: {train_embeddings_path}, Type: {type(train_embeddings_path)}")
print(f"Output path: {output_path}, Type: {type(output_path)}")

# VALIDATION OF FILE PATH TYPES
if not isinstance(train_data_path, (str, bytes, os.PathLike)):
    raise TypeError(f"train_data_path must be a string or PathLike, got {type(train_data_path)}")
if not isinstance(test_data_path, (str, bytes, os.PathLike)):
    raise TypeError(f"test_data_path must be a string or PathLike, got {type(test_data_path)}")
if not isinstance(train_embeddings_path, (str, bytes, os.PathLike)):
    raise TypeError(f"train_embeddings_path must be a string or PathLike, got {type(train_embeddings_path)}")

# CHECKS FOR FILE EXISTENCE
if not os.path.exists(train_data_path):
    raise FileNotFoundError(f"Training file not found at {train_data_path}. Ensure the file exists and Google Drive is mounted.")
if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"Test file not found at {test_data_path}. Ensure the file exists and Google Drive is mounted.")

# DATA LOADING
start_time = time.time()
try:
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
except Exception as e:
    raise RuntimeError(f"Error loading CSV files: {e}") from e
print(f"load_data took {time.time() - start_time:.2f} seconds")
print("Training data columns:", train_df.columns.tolist())
print("Test data columns:", test_df.columns.tolist())
print(f"Training data size: {len(train_df)} documents")
print(f"Test data size: {len(test_df)} queries")

# VALIDATE DATASET COLUMNS AND SIZE
if 'docno' not in train_df.columns or 'text' not in train_df.columns:
    raise KeyError("Training data must contain 'docno' and 'text' columns")
if 'qid' not in test_df.columns or 'query' not in test_df.columns:
    raise KeyError("Test data must contain 'qid' and 'query' columns")
# Emptiness is checked before the exact-size rule; in the opposite order the
# "Test dataset is empty" branch could never be reached.
if len(train_df) == 0:
    raise ValueError("Training dataset is empty")
if len(test_df) == 0:
    raise ValueError("Test dataset is empty")
if len(test_df) != 50:
    raise ValueError(f"Test set must contain exactly 50 queries, got {len(test_df)}")

# CLEANING LATEX EXPRESSIONS
def clean_latex(expr):
    """Reduce a LaTeX-ish snippet to a plain string SymPy has a chance to parse.

    Steps, in order: drop ``$`` math delimiters, drop LaTeX commands
    (a backslash followed by letters), drop braces, turn ``^`` into ``**``
    and remove all whitespace. The conversion is lossy: commands are removed
    rather than translated, so e.g. ``\\frac{a}{b}`` becomes ``ab``.
    """
    # Delimiters must go first: a leftover '$' makes the expression unparseable.
    expr = expr.replace('$', '')
    expr = re.sub(r'\\[a-zA-Z]+', '', expr)
    expr = expr.replace('{', '').replace('}', '')
    expr = expr.replace('^', '**')
    expr = re.sub(r'\s+', '', expr)
    return expr

# ENHANCED TEXT AND MATH EXTRACTION WITH SPACY
# What counts as "math" inside running text. Alternatives are tried in order:
#   1. an inline ``$...$`` span;
#   2. a sum/integral/partial/limit keyword (optionally LaTeX-prefixed with a
#      backslash) directly followed by a parenthesised argument;
#   3. a whitespace-free token that contains at least one of ``^ * / + =`` and
#      at least one word character, so ordinary words are no longer treated
#      as math. Expressions written with spaces ("a + b") are not recognised
#      by this alternative.
_MATH_PATTERN = re.compile(
    r'\$.*?\$'
    r'|\\?\b(?:sum|integral|partial|limit)(?![a-zA-Z])[^\s()]*\([^()\n]*\)'
    r'|(?=[\w\^*/+\-()=]*[\^*/+=])(?=[\^*/+\-()=]*\w)[\w\^*/+\-()=]+'
)

def extract_text_math(content):
    """Split *content* into tokenised text and (at most one) math expression.

    Args:
        content: Cell value to process. ``None`` and float NaN are treated as
            empty text; anything else is converted with ``str``.

    Returns:
        ``(clean_text, math_expressions)``. ``clean_text`` is the spaCy tokens
        of the input joined by single spaces, skipping tokens that start with
        ``$`` (a single space when nothing remains). ``math_expressions`` is a
        list holding the first pattern match that is non-empty after
        ``clean_latex`` (empty list when there is none).
    """
    # Missing values would otherwise be embedded as the literal text "nan".
    if content is None or (isinstance(content, float) and content != content):
        content = ''
    content = str(content)
    doc = nlp(content)
    clean_text = ' '.join(token.text for token in doc if not token.text.startswith('$'))

    math_expressions = []
    for raw_match in _MATH_PATTERN.findall(content):
        cleaned = clean_latex(raw_match)
        if cleaned:
            # Only the first usable expression is kept.
            math_expressions.append(cleaned)
            break
    return clean_text or ' ', math_expressions

# PREPROCESSING DATA: derive a text column and a math-expression column per frame
start_time = time.time()
try:
    _train_text, _train_math = zip(*train_df['text'].apply(extract_text_math))
    train_df['clean_text'] = _train_text
    train_df['math_expressions'] = _train_math
    _test_text, _test_math = zip(*test_df['query'].apply(extract_text_math))
    test_df['clean_text'] = _test_text
    test_df['math_expressions'] = _test_math
    # Empty or missing text is replaced by a single space before encoding
    for _frame in (train_df, test_df):
        _frame['clean_text'] = _frame['clean_text'].replace('', ' ').fillna(' ')
except Exception as e:
    raise RuntimeError(f"Error preprocessing data: {e}")
print(f"preprocess_data took {time.time() - start_time:.2f} seconds")

# CACHED SYMPIFY WITH ENHANCED ERROR HANDLING
math_cache = {}
def cached_sympify(expr):
    """Parse and simplify *expr* with SymPy, memoising the result.

    Returns the simplified expression, or ``None`` when *expr* is empty or
    when parsing/simplification fails. Failures are cached too, so a bad
    expression is only attempted once; empty input is never cached.

    NOTE: ``sympify`` evaluates its input string; only feed it data you trust.
    """
    if not expr:
        return None
    if expr not in math_cache:
        try:
            parsed = sp.sympify(expr, evaluate=False)
            math_cache[expr] = sp.simplify(parsed)
        except Exception:
            # Any parsing/simplification problem marks the expression as
            # unusable, but KeyboardInterrupt/SystemExit are not swallowed.
            math_cache[expr] = None
    return math_cache[expr]

# ENHANCED MATH SIMILARITY
def math_similarity(expr1, expr2):
    """Score how similar two expression strings are, in the range [0.0, 1.0].

    Returns:
        1.0 when the parsed expressions have the same string form or their
        simplified difference is zero. Otherwise both are evaluated at
        ``x = 1`` and the score is ``1 - |v1 - v2| / (|v1| + 1e-10)``,
        floored at 0.0. If that numeric evaluation fails the score is 0.5.
        0.0 is returned when either expression cannot be parsed or any other
        step raises.

    NOTE(review): the numeric score is normalised by ``expr1`` only, so the
    function is not symmetric in its arguments.
    """
    try:
        parsed_expr1 = cached_sympify(expr1)
        parsed_expr2 = cached_sympify(expr2)
        if parsed_expr1 is None or parsed_expr2 is None:
            return 0.0
        if str(parsed_expr1) == str(parsed_expr2):
            return 1.0
        if sp.simplify(parsed_expr1 - parsed_expr2) == 0:
            return 1.0
        try:
            # Numeric fallback: compare the values at a single sample point.
            x = sp.Symbol('x')
            value1 = parsed_expr1.subs(x, 1)
            value2 = parsed_expr2.subs(x, 1)
            eval_diff = abs(float(value1 - value2))
            return max(0.0, 1.0 - eval_diff / (abs(float(value1)) + 1e-10))
        except Exception:
            # E.g. other free symbols remain, so float() is impossible.
            return 0.5
    except Exception:
        # Best-effort scoring: a failing comparison counts as "no similarity".
        # Exception (not a bare except) so Ctrl-C can still stop a slow simplify.
        return 0.0

# LOAD OR COMPUTE EMBEDDINGS WITH MIXED PRECISION
start_time = time.time()
batch_size = 128 if torch.cuda.is_available() else 64  # Reduced batch size to avoid OOM
# The cache is only usable if it has one row per document AND the model's width;
# a row-count mismatch would silently pair embeddings with the wrong docnos.
expected_shape = (len(train_df), expected_dim)
train_text_embeddings = None
if os.path.exists(train_embeddings_path):
    print("Loading precomputed train embeddings...")
    try:
        embeddings_df = pd.read_csv(train_embeddings_path)
        print(f"Precomputed embeddings CSV shape: {embeddings_df.shape}")
        if embeddings_df.shape == expected_shape:
            train_text_embeddings = torch.tensor(embeddings_df.values, dtype=torch.float32).to(model.device)
            print(f"Loaded train embeddings shape: {train_text_embeddings.shape}")
        else:
            print(f"Warning: Precomputed embeddings shape {embeddings_df.shape} does not match expected shape {expected_shape}. Recomputing embeddings...")
    except Exception as e:
        # An unreadable or non-numeric cache is treated like a missing one.
        train_text_embeddings = None
        print(f"Error loading embeddings: {e}. Recomputing embeddings...")
else:
    print("Computing train embeddings...")

if train_text_embeddings is None:
    # Single encode-and-save path. It sits outside the try above so that an
    # encoding failure is not misreported as a cache-loading error and retried.
    with autocast():
        train_text_embeddings = model.encode(train_df['clean_text'].tolist(), batch_size=batch_size, convert_to_tensor=True, show_progress_bar=True)
    pd.DataFrame(train_text_embeddings.cpu().numpy()).to_csv(train_embeddings_path, index=False)
    print(f"Saved new train embeddings to {train_embeddings_path}, shape: {train_text_embeddings.shape}")

try:
    with autocast():
        test_text_embeddings = model.encode(test_df['clean_text'].tolist(), batch_size=batch_size, convert_to_tensor=True, show_progress_bar=True)
except Exception as e:
    raise RuntimeError(f"Error computing test embeddings: {e}") from e
print(f"Test embeddings shape: {test_text_embeddings.shape}")
print(f"load_or_compute_embeddings took {time.time() - start_time:.2f} seconds")

# DIMENSIONALITY VERIFICATION
if train_text_embeddings.shape[1] != expected_dim:
    raise ValueError(f"Train embeddings dimension {train_text_embeddings.shape[1]} does not match model dimension {expected_dim}")
if test_text_embeddings.shape[1] != expected_dim:
    raise ValueError(f"Test embeddings dimension {test_text_embeddings.shape[1]} does not match model dimension {expected_dim}")

# BUILDING FAISS INDEX
start_time = time.time()
# Host-side float32 copy of the train embeddings, unit-normalised in place
train_embeddings_np = np.ascontiguousarray(train_text_embeddings.cpu().numpy(), dtype=np.float32)
faiss.normalize_L2(train_embeddings_np)
dimension = train_text_embeddings.shape[1]
# One cluster per ten documents, clamped to the range [1, 200]
nlist = max(1, len(train_df) // 10)  # Ensure nlist is at least 1
nlist = min(200, nlist)
quantizer = faiss.IndexFlatIP(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)
try:
    index.train(train_embeddings_np)
    index.add(train_embeddings_np)
except Exception as e:
    raise RuntimeError(f"Error building FAISS index: {e}")
# Probe at most 20 inverted lists, fewer when fewer exist
index.nprobe = nlist if nlist < 20 else 20  # Dynamic nprobe
print(f"build_faiss_index took {time.time() - start_time:.2f} seconds")

# COMPUTE HYBRID SIMILARITY FOR A SINGLE QUERY
def compute_query_similarities(args):
    """Rank documents for one query by a weighted text + math score.

    Args:
        args: Tuple ``(i, query_emb, query_math, train_embeddings, train_math,
            docnos)``. ``query_emb`` may be a torch tensor or any array-like;
            ``train_embeddings`` is only consulted for its second dimension;
            ``train_math`` and ``docnos`` are indexed with the row ids
            returned by the module-level FAISS ``index``.

    Returns:
        ``(i, top)`` where ``top`` is a list of up to 50
        ``(docno, score, text_similarity, math_similarity)`` tuples sorted by
        descending score, with
        ``score = 0.6 * text_similarity + 0.4 * math_similarity``.

    Raises:
        ValueError: If the query embedding width differs from the train
            embedding width.
        RuntimeError: If the FAISS search itself fails.
    """
    start_time = time.time()
    i, query_emb, query_math, train_embeddings, train_math, docnos = args
    # Torch tensors are moved to host memory first; plain arrays pass through.
    if hasattr(query_emb, 'cpu'):
        query_emb = query_emb.cpu().numpy()
    query_emb_np = np.ascontiguousarray(query_emb, dtype=np.float32).reshape(1, -1)
    if query_emb_np.shape[1] != train_embeddings.shape[1]:
        raise ValueError(f"Query embedding dimension {query_emb_np.shape[1]} does not match index dimension {train_embeddings.shape[1]}")
    faiss.normalize_L2(query_emb_np)
    k = 500
    try:
        distances, indices = index.search(query_emb_np, k)
    except Exception as e:
        raise RuntimeError(f"Error during FAISS search for query {i}: {e}") from e
    num_docs = len(docnos)
    similarities = []
    for text_sim, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with label -1 when fewer than k
        # neighbours are found; those (and any out-of-range id) are skipped.
        if idx < 0 or idx >= num_docs:
            continue
        doc_math = train_math[idx]
        math_sim = 0.0
        if query_math and doc_math:
            # Only the first expression on each side is compared.
            math_sim = math_similarity(query_math[0], doc_math[0])
        sim_score = 0.6 * text_sim + 0.4 * math_sim  # Adjusted weights
        similarities.append((docnos[idx], sim_score, text_sim, math_sim))
    similarities.sort(key=lambda x: x[1], reverse=True)
    print(f"compute_query_similarities for query {i} took {time.time() - start_time:.2f} seconds")
    return i, similarities[:50]

# PARALLEL SIMILARITY COMPUTATION
start_time = time.time()
# Bound up front so the cleanup below is safe even if Pool() itself fails.
pool = None
try:
    # Every argument tuple is pickled and sent to a worker process. Move the
    # embeddings to host memory first: rebuilding CUDA tensors inside
    # fork-started workers would require re-initialising CUDA there, which
    # PyTorch does not support.
    train_embeddings_cpu = train_text_embeddings.cpu()
    args = [(i, query_emb, test_df['math_expressions'][i], train_embeddings_cpu, train_df['math_expressions'], train_df['docno'])
            for i, query_emb in enumerate(test_text_embeddings.cpu())]
    pool = Pool()
    results_parallel = pool.map(compute_query_similarities, args)
except Exception as e:
    raise RuntimeError(f"Error in parallel similarity computation: {e}") from e
finally:
    if pool is not None:
        pool.close()
        pool.join()
print(f"compute_similarities_parallel took {time.time() - start_time:.2f} seconds")

# COLLECT AND SAVE RESULTS
start_time = time.time()
results = []
visualization_data = []
# Walk the per-query results in query order; each hit yields one submission
# row and one diagnostic row carrying the score components and the rank.
for i, similarities in sorted(results_parallel, key=lambda pair: pair[0]):
    query_id = test_df['qid'][i]
    rank = 0
    for docno, sim_score, text_sim, math_sim in similarities:
        rank += 1
        submission_row = {
            'query_ID': query_id,
            'retrieved_body_ID': docno,
            'Run No.': 2,
            'Similarity Score': sim_score,
        }
        diagnostic_row = {
            'query_ID': query_id,
            'docno': docno,
            'rank': rank,
            'text_similarity': text_sim,
            'math_similarity': math_sim,
            'combined_similarity': sim_score,
        }
        results.append(submission_row)
        visualization_data.append(diagnostic_row)
print(f"collect_results took {time.time() - start_time:.2f} seconds")
try:
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}, shape: {results_df.shape}")
    vis_df = pd.DataFrame(visualization_data)
    vis_df.to_csv(visualization_path, index=False)
    print(f"Visualization data saved to {visualization_path}, shape: {vis_df.shape}")
except Exception as e:
    raise RuntimeError(f"Error saving results to {output_path}: {e}")