In [1]:
# This script implements a full distillation loop:
# 1. Generate pairs of papers and score them with a base embedding model.
# 2. Have a powerful LLM (Gemini) review and re-score these pairs.
# 3. Fine-tune the base model using the LLM's scores as the ground truth.
# 4. Test the new model to see if its understanding of similarity has improved.

import pandas as pd
import numpy as np
import random
import asyncio
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from google import genai

# --- 1. CONFIGURATION ---
print("--- 1. CONFIGURATION ---")
INPUT_CSV_PATH = "/content/drive/MyDrive/google_colab/arxiv/research_paper_202511302127.csv"
INITIAL_SCORED_PAIRS_CSV = "/content/drive/MyDrive/google_colab/arxiv/initial_scored_pairs.csv"
LLM_VERIFIED_PAIRS_CSV = "/content/drive/MyDrive/google_colab/arxiv/llm_verified_pairs.csv"
FINETUNED_MODEL_PATH = "/content/drive/MyDrive/google_colab/arxiv/finetuned_model_from_analysis"
BASE_MODEL_NAME = 'all-MiniLM-L6-v2'
TOTAL_PAIRS_TO_GENERATE = 10000
LLM_BATCH_SIZE = 10

--- 1. CONFIGURATION ---


In [2]:


try:
    GOOGLE_API_KEY = "AIzaSyC5iHG1QxVM66oNA-DUyEdwPyXr2uWvr4E"
    client = genai.Client(api_key=GOOGLE_API_KEY)
    print("Gemini API configured.")
except Exception as e:
    print(f"Error: Could not configure Gemini API. {e}")
    exit()

# --- 2. LOAD INITIAL DATA ---
def load_data(filepath):
    print(f"\n--- 2. LOADING DATA from {filepath} ---")
    try:
        df = pd.read_csv(filepath)
        df = df[['id', 'title', 'description']].dropna()
        print(f"Successfully loaded {len(df)} papers.")
        return df
    except FileNotFoundError:
        print(f"❌ Error: Input file not found at {filepath}")
        exit()


Gemini API configured.


In [3]:

# --- 3. GENERATE AND SCORE PAIRS ---
def generate_and_classify_pairs(df, model, num_pairs):
    print(f"\n--- 3. GENERATING/LOADING INITIAL PAIRS ---")

    if os.path.exists(INITIAL_SCORED_PAIRS_CSV):
        print(f"Found existing file '{INITIAL_SCORED_PAIRS_CSV}'. Loading pairs from file.")
        return pd.read_csv(INITIAL_SCORED_PAIRS_CSV)

    print(f"Generating {num_pairs} new pairs and scoring with '{BASE_MODEL_NAME}'...")
    all_pairs = []
    seen_paper_ids = set()
    papers_dict = df.set_index('id').to_dict('index')
    paper_ids = df['id'].tolist()

    pbar = tqdm(total=num_pairs, desc="Generating Pairs")
    while len(all_pairs) < num_pairs:
        id1, id2 = random.sample(paper_ids, 2)

        pair_key = tuple(sorted((id1, id2)))
        if pair_key in seen_paper_ids:
            continue

        seen_paper_ids.add(pair_key)

        paper1 = papers_dict[id1]
        paper2 = papers_dict[id2]

        # Generate embeddings
        text1 = paper1['title'] + ": " + paper1['description']
        text2 = paper2['title'] + ": " + paper2['description']
        embeddings = model.encode([text1, text2])

        # Calculate cosine similarity
        score = cos_sim(embeddings[0], embeddings[1]).item()

        all_pairs.append({
            "id1": id1, "title1": paper1['title'], "description1": paper1['description'],
            "id2": id2, "title2": paper2['title'], "description2": paper2['description'],
            "score": score
        })
        pbar.update(1)

    pbar.close()

    pairs_df = pd.DataFrame(all_pairs)
    print(f"Generated and scored {len(pairs_df)} pairs.")
    pairs_df.to_csv(INITIAL_SCORED_PAIRS_CSV, index=False)
    print(f"Initial scores saved to '{INITIAL_SCORED_PAIRS_CSV}'")
    return pairs_df


In [4]:

# --- 4. LLM VERIFICATION ---
async def get_llm_verification(paper1_title, paper1_desc, paper2_title, paper2_desc):
    """Asks the LLM for a similarity score and reasoning."""
    prompt = f"""
    You are a meticulous research assistant. On a scale of 0.0 to 1.0, how relevant is Paper B as a recommendation for someone who just finished reading Paper A?
    After the score, provide a very brief, one-sentence reason for your score.

    Format your response as: [SCORE] | [REASON]
    Example: 0.85 | Both papers focus on convolutional neural networks for image classification.

    Paper A:
    Title: {paper1_title}
    Abstract: {paper1_desc}

    Paper B:
    Title: {paper2_title}
    Abstract: {paper2_desc}
    """
    try:
        # model = client.models('gemini-1.5-flash')
        response = await client.aio.models.generate_content(
            model='gemini-2.5-flash',
            contents=prompt
        )

        parts = response.text.strip().split('|')
        score_text = parts[0].strip()
        reason = parts[1].strip() if len(parts) > 1 else "No reason provided."

        score = float(score_text)
        return max(0.0, min(1.0, score)), reason
    except Exception as e:
        print(e)
        return e, str(e)
    # finally:
    #   await asyncio.sleep(60)


In [5]:

async def verify_with_llm(pairs_df):
    """Orchestrates the LLM verification process in parallel batches."""
    print(f"\n--- 4. VERIFYING {len(pairs_df)} PAIRS WITH LLM ---")

    if os.path.exists(LLM_VERIFIED_PAIRS_CSV):
        print(f"Found existing file '{LLM_VERIFIED_PAIRS_CSV}'. Loading pairs from file.")
        return pd.read_csv(LLM_VERIFIED_PAIRS_CSV)

    verified_results = []

    for i in tqdm(range(0, len(pairs_df), LLM_BATCH_SIZE), desc="Verifying with LLM"):
        batch_df = pairs_df.iloc[i:i + LLM_BATCH_SIZE]

        tasks = [
            get_llm_verification(
                row['title1'], row['description1'],
                row['title2'], row['description2']
            ) for _, row in batch_df.iterrows()
        ]

        results = await asyncio.gather(*tasks)

        for index, (llm_score, llm_reason) in enumerate(results):
            original_row = batch_df.iloc[index].to_dict()
            if isinstance(llm_score, float):
                original_row['llm_score'] = llm_score
                original_row['llm_reason'] = llm_reason
            else:
                original_row['llm_score'] = -1.0 # Indicate error
                original_row['llm_reason'] = llm_reason
            verified_results.append(original_row)

    verified_df = pd.DataFrame(verified_results)
    # Filter out any errored rows before saving
    verified_df = verified_df[verified_df['llm_score'] != -1.0]
    print(f"LLM verification complete. {len(verified_df)} pairs successfully verified.")
    verified_df.to_csv(LLM_VERIFIED_PAIRS_CSV, index=False)
    print(f"LLM-verified scores saved to '{LLM_VERIFIED_PAIRS_CSV}'")
    return verified_df


In [6]:

# --- 5. FINE-TUNE THE MODEL ---
def fine_tune_model(training_df):
    """Fine-tunes the base model using the LLM-verified scores."""
    print(f"\n--- 5. FINE-TUNING MODEL with {len(training_df)} examples ---")

    if os.path.exists(FINETUNED_MODEL_PATH):
        print(f"Found existing file '{FINETUNED_MODEL_PATH}'")
        return
    # Prepare examples for sentence-transformers
    train_examples = []
    for _, row in training_df.iterrows():
        text1 = row['title1'] + ": " + row['description1']
        text2 = row['title2'] + ": " + row['description2']
        # Use the LLM's score as the new "ground truth" label
        train_examples.append(InputExample(texts=[text1, text2], label=float(row['llm_score'])))

    # Define model, loss, and dataloader
    model = SentenceTransformer(BASE_MODEL_NAME)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model=model)

    print("Starting fine-tuning...")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=4, # 1 epoch is often enough for fine-tuning
        warmup_steps=100,
        output_path=FINETUNED_MODEL_PATH,
        show_progress_bar=True
    )
    print(f"Fine-tuning complete. New model saved to '{FINETUNED_MODEL_PATH}'")


In [7]:

# --- 6. TEST FOR IMPROVEMENT ---
def test_improvement(original_model, finetuned_model_path):
    """Compares the original and fine-tuned models on test pairs."""
    print(f"\n--- 6. TESTING FOR IMPROVEMENT ---")

    try:
        finetuned_model = SentenceTransformer(finetuned_model_path)
    except Exception as e:
        print(f"Could not load fine-tuned model: {e}")
        return

    # Define test pairs
    paper1 = """
    Toward an Evaluation Science for Generative AI Systems
    There is an increasing imperative to anticipate and understand the
performance and safety of generative AI systems in real-world deployment
contexts. However, the current evaluation ecosystem is insufficient: Commonly
used static benchmarks face validity challenges, and ad hoc case-by-case audits
rarely scale. In this piece, we advocate for maturing an evaluation science for
generative AI systems. While generative AI creates unique challenges for system
safety engineering and measurement science, the field can draw valuable
insights from the development of safety evaluation practices in other fields,
including transportation, aerospace, and pharmaceutical engineering. In
particular, we present three key lessons: Evaluation metrics must be applicable
to real-world performance, metrics must be iteratively refined, and evaluation
institutions and norms must be established. Applying these insights, we outline
a concrete path toward a more rigorous approach for evaluating generative AI
systems.
    """
    paper2 = """
    An Empirical Evaluation of Large Language Models on Consumer Health
  Questions
  This study evaluates the performance of several Large Language Models (LLMs)
on MedRedQA, a dataset of consumer-based medical questions and answers by
verified experts extracted from the AskDocs subreddit. While LLMs have shown
proficiency in clinical question answering (QA) benchmarks, their effectiveness
on real-world, consumer-based, medical questions remains less understood.
MedRedQA presents unique challenges, such as informal language and the need for
precise responses suited to non-specialist queries. To assess model
performance, responses were generated using five LLMs: GPT-4o mini, Llama 3.1:
70B, Mistral-123B, Mistral-7B, and Gemini-Flash. A cross-evaluation method was
used, where each model evaluated its responses as well as those of others to
minimize bias. The results indicated that GPT-4o mini achieved the highest
alignment with expert responses according to four out of the five models'
judges, while Mistral-7B scored lowest according to three out of five models'
judges. This study highlights the potential and limitations of current LLMs for
consumer health medical question answering, indicating avenues for further
development.

    """
    paper3 = """
    Off-Trajectory Reasoning: Can LLMs Collaborate on Reasoning Trajectory?
    Reasoning LLMs are trained to verbalize their reasoning process, yielding strong gains on complex tasks. This transparency also opens a promising direction: multiple reasoners can directly collaborate on each other's thinking within a shared trajectory, yielding better inference efficiency and exploration. A key prerequisite, however, is the ability to assess the usefulness and build on another model's partial thinking -- we call this off-trajectory reasoning. Our paper investigates a critical question: can standard solo-reasoning training pipelines deliver desired off-trajectory behaviors? We propose twin tests that capture the two extremes of the off-trajectory spectrum, namely Recoverability, which tests whether LLMs can backtrack from "distractions" induced by misleading reasoning traces, and Guidability, which tests their ability to build upon correct reasoning from stronger collaborators. Our study evaluates 15 open-weight LLMs (1.5B-32B) and reveals a counterintuitive finding -- "stronger" LLMs on benchmarks are often more fragile under distraction. Moreover, all models tested fail to effectively leverage guiding steps from collaborators on problems beyond their inherent capabilities with solve rates remaining under 9.2%. Finally, we conduct control studies to isolate the effects of three factors in post-training on these behaviors: the choice of distillation teacher, the use of RL, and data selection strategy. Our results provide actionable insights for training natively strong reasoning collaborators; e.g., we find that suboptimal recoverability behaviors of teacher models are transferred to distilled students even if the distillation trajectories are correct. Taken together, this work lays the groundwork for evaluating multi-model collaborations in shared reasoning trajectories and highlights the limitations of off-the-shelf reasoning LLMs.
    """

    paper4 = """
    Bidirectional Decoding: Improving Action Chunking via Guided Test-Time
  Sampling
  Predicting and executing a sequence of actions without intermediate
replanning, known as action chunking, is increasingly used in robot learning
from human demonstrations. Yet, its effects on the learned policy remain
inconsistent: some studies find it crucial for achieving strong results, while
others observe decreased performance. In this paper, we first dissect how
action chunking impacts the divergence between a learner and a demonstrator. We
find that action chunking allows the learner to better capture the temporal
dependencies in demonstrations but at the cost of reduced reactivity to
unexpected states. To address this tradeoff, we propose Bidirectional Decoding
(BID), a test-time inference algorithm that bridges action chunking with
closed-loop adaptation. At each timestep, BID samples multiple candidate
predictions and searches for the optimal one based on two criteria: (i)
backward coherence, which favors samples that align with previous decisions;
(ii) forward contrast, which seeks samples of high likelihood for future plans.
By coupling decisions within and across action chunks, BID promotes both
long-term consistency and short-term reactivity. Experimental results show that
our method boosts the performance of two state-of-the-art generative policies
across seven simulation benchmarks and two real-world tasks. Code and videos
are available at https://bid-robot.github.io.


    """
    pair_positive = [paper1, paper2]
    pair_negative = [paper3, paper4]

    # Test Original Model
    print(f"\n--- Testing ORIGINAL Model ({BASE_MODEL_NAME}) ---")
    emb_orig_pos = original_model.encode(pair_positive)
    emb_orig_neg = original_model.encode(pair_negative)
    print(f"Positive Pair Score: {cos_sim(emb_orig_pos[0], emb_orig_pos[1]).item():.4f}")
    print(f"Negative Pair Score: {cos_sim(emb_orig_neg[0], emb_orig_neg[1]).item():.4f}")

    # Test Fine-Tuned Model
    print(f"\n--- Testing FINE-TUNED Model ({finetuned_model_path}) ---")
    emb_tuned_pos = finetuned_model.encode(pair_positive)
    emb_tuned_neg = finetuned_model.encode(pair_negative)
    print(f"Positive Pair Score: {cos_sim(emb_tuned_pos[0], emb_tuned_pos[1]).item():.4f}")
    print(f"Negative Pair Score: {cos_sim(emb_tuned_neg[0], emb_tuned_neg[1]).item():.4f}")


In [8]:
!pip install pymilvus[milvus_lite]
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
MILVUS_ALIAS = "default"
MILVUS_URI = "/content/drive/MyDrive/google_colab/arxiv/milvus_demo.db"
MILVUS_COLLECTION_NAME = "research_papers"
MILVUS_CUSTOM_COLLECTION_NAME = "research_papers_custom"
EMBEDDING_DIM = 384
MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_BATCH_SIZE = 1000

def setup_milvus_collection(collection_name):
    """Connects to Milvus and ensures the collection is created."""
    print("Connecting to Milvus...")
    connections.connect(alias=MILVUS_ALIAS, uri=MILVUS_URI)
    if utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' already exists.")
        return Collection(collection_name)

    print(f"Creating collection '{collection_name}'...")
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
    ]
    schema = CollectionSchema(fields, "Research paper embeddings")
    collection = Collection(collection_name, schema)
    index_params = {"metric_type": "IP", "index_type": "IVF_FLAT", "params": {"nlist": 384}}
    collection.create_index(field_name="embedding", index_params=index_params)
    print("Milvus setup complete.")
    return collection

Collecting pymilvus[milvus_lite]
  Downloading pymilvus-2.6.4-py3-none-any.whl.metadata (6.6 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus[milvus_lite])
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymilvus-2.6.4-py3-none-any.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.0/278.0 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: milvus-lite, pymilvus
Successfully installed milvus-lite-2.5.1 pymilvus-2.6.4


In [9]:
def load_milvus(papers_df, model_name, collection_name):
  if os.path.exists(MILVUS_URI):
    print(f"Found existing file '{MILVUS_URI}'")
    return

  #   return
  # if utility.has_collection(collection_name):
  #   return
  model = SentenceTransformer(model_name, device="cuda")
  papers_dict = papers_df.set_index('id').to_dict('index')
  all_paper_ids = []
  all_embeddings = []
  texts_to_embed = []
  for paper_id in papers_dict:
    chunk = papers_dict[paper_id]
    text = f"{chunk["title"]}: {chunk["description"]}"
    # print(chunk)
    texts_to_embed.append(text)

    all_paper_ids.append(paper_id)
  print(len(texts_to_embed))
  print(texts_to_embed[0])
  all_embeddings = model.encode(
        texts_to_embed,
        show_progress_bar=True
    )

  data_to_insert = [all_paper_ids, all_embeddings]
  print(data_to_insert)
  collection = setup_milvus_collection(collection_name)
  # collection.insert(data_to_insert)
  num_records = len(all_paper_ids)
  for i in range(0, num_records, 1000):
    batch_ids = all_paper_ids[i:i + 1000]
    batch_embeddings = all_embeddings[i:i + 1000]

    # Prepare the batched data package
    _data_to_insert = [batch_ids, batch_embeddings]

    print(f"Inserting batch {i//1000 + 1} of {num_records//1000 + 1}...")

    # Insert the batch
    collection.insert(_data_to_insert)
  collection.flush()


In [10]:

# --- MAIN EXECUTION ---
async def main():
    # Step 1: Load data
    papers_df = load_data(INPUT_CSV_PATH)

    # Step 2: Generate pairs and get initial scores
    base_model = SentenceTransformer(BASE_MODEL_NAME)
    initial_pairs_df = generate_and_classify_pairs(papers_df, base_model, TOTAL_PAIRS_TO_GENERATE)

    # Step 3: Verify scores with LLM
    verified_pairs_df = await verify_with_llm(initial_pairs_df)

    # Step 4: Fine-tune the model with verified data
    if not verified_pairs_df.empty:
        fine_tune_model(verified_pairs_df)

        # Step 5: Test the improvement
        test_improvement(base_model, FINETUNED_MODEL_PATH)
    else:
        print("Skipping fine-tuning as no pairs were verified by the LLM.")

    load_milvus(papers_df, MODEL_NAME, MILVUS_COLLECTION_NAME)
    load_milvus(papers_df, FINETUNED_MODEL_PATH, MILVUS_CUSTOM_COLLECTION_NAME)






In [11]:
    # Ensure you have installed the necessary packages:
    # pip install pandas numpy sentence-transformers scikit-learn google-generativeai tqdm
    # Ensure you have installed the necessary packages:
    # pip install pandas numpy sentence-transformers scikit-learn google-generativeai tqdm
    await main()
    print("\nScript finished.")


--- 2. LOADING DATA from /content/drive/MyDrive/google_colab/arxiv/research_paper_202511302127.csv ---
Successfully loaded 122502 papers.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- 3. GENERATING/LOADING INITIAL PAIRS ---
Found existing file '/content/drive/MyDrive/google_colab/arxiv/initial_scored_pairs.csv'. Loading pairs from file.

--- 4. VERIFYING 100 PAIRS WITH LLM ---
Found existing file '/content/drive/MyDrive/google_colab/arxiv/llm_verified_pairs.csv'. Loading pairs from file.

--- 5. FINE-TUNING MODEL with 100000 examples ---
Found existing file '/content/drive/MyDrive/google_colab/arxiv/finetuned_model_from_analysis'

--- 6. TESTING FOR IMPROVEMENT ---

--- Testing ORIGINAL Model (all-MiniLM-L6-v2) ---
Positive Pair Score: 0.2431
Negative Pair Score: 0.4915

--- Testing FINE-TUNED Model (/content/drive/MyDrive/google_colab/arxiv/finetuned_model_from_analysis) ---
Positive Pair Score: 0.1933
Negative Pair Score: 0.1745
Found existing file '/content/drive/MyDrive/google_colab/arxiv/milvus_demo.db'
Found existing file '/content/drive/MyDrive/google_colab/arxiv/milvus_demo.db'

Script finished.


In [12]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from pymilvus import Collection, connections

# --- CONFIGURATION (Assumed) ---
TOP_K = 20
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- UTILITY FUNCTIONS (Assumed or Simplified) ---

def load_data(path):
    # This is where your papers_df is loaded
    return pd.read_csv(path)
# ----------------------------------------------------------------------

def search_milvus_collections(
    queries: list,
    collection_names: list,
    paper_lookup: dict,
    search_params: dict,
    default_model: SentenceTransformer,
    finetuned_model: SentenceTransformer
) -> list:
    """
    Encodes a list of queries and searches multiple Milvus collections,
    returning results in a structured list of dictionaries.
    """
    print(f"--- Starting Semantic Search for {len(queries)} Queries ---")
    final_results = []

    # 1. Outer Loop: Iterate through each query
    for query in queries:
        print(f"\nProcessing Query: '{query}'")

        # Initialize the dictionary for the current query
        query_result = {"query": query, "base": [], "custom": []}

        # 2. Inner Loop: Iterate through each Milvus collection
        for name in collection_names:

            # --- Configure Model, Connection, and Result Key ---
            if name == MILVUS_COLLECTION_NAME:
                model_to_use = default_model
                alias = MILVUS_ALIAS
                uri = MILVUS_URI
                result_key = "base"
            elif name == MILVUS_CUSTOM_COLLECTION_NAME:
                model_to_use = finetuned_model
                alias = MILVUS_ALIAS
                uri = MILVUS_URI
                result_key = "custom"
            else:
                continue

            print(f"  Searching Collection: {name}")

            try:
                # Encode the search query
                query_vector = model_to_use.encode(query).tolist()
                query_vectors = [query_vector]

                collection = setup_milvus_collection(name)

                # The search
                results = collection.search(
                    data=query_vectors,
                    anns_field="embedding",
                    param=search_params,
                    limit=TOP_K,
                    output_fields=["id"] # Only need the ID to look up data
                )

                # 3. Process Results and Structure Output
                if results and results[0]:
                    for hit in results[0]:
                        paper_id = hit.entity.get('id', 'N/A')

                        # Use the fast dictionary lookup
                        paper_data = paper_lookup.get(paper_id, {})

                        # For Inner Product (IP), the distance is the similarity score (closer to 1 is better)
                        score = round(hit.distance, 4)

                        hit_data = {
                            "id": paper_id,
                            "title": paper_data.get('title', 'N/A'),
                            "description": paper_data.get('description', 'N/A')[:100],
                            "score": score
                        }
                        query_result[result_key].append(hit_data)

            except Exception as e:
                print(f"  ERROR searching {name}: {e}")

        final_results.append(query_result)

    print("--- Search Complete ---")
    return final_results

# ----------------------------------------------------------------------
# --- MAIN EXECUTION BLOCK ---
# ----------------------------------------------------------------------

# 1. Load Data for Lookup (Do this ONLY ONCE)
print(f"Loading data from {INPUT_CSV_PATH}...")
papers_df = load_data(INPUT_CSV_PATH)
# Convert DataFrame to a fast lookup dictionary:
paper_lookup = papers_df.set_index('id').to_dict('index')

# 2. Load Models (Do this ONLY ONCE)
default_model = SentenceTransformer(MODEL_NAME, device=DEVICE)
# Load the finetuned model (assuming FINETUNED_MODEL_PATH is defined)
finetuned_model = SentenceTransformer(FINETUNED_MODEL_PATH, device=DEVICE)
print("Models loaded successfully.")

# 3. Define Parameters
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 10}
}

# 4. Define Multiple Queries
# queries_list = [
#     "LLM and reasoning",
#     "Evaluation Gen AI",
#     "Attention multi head ",
#     "Medical imaging using deep learning",
#     "Preventing LLM hallucination",
#     "named entity recognition",
#     "Sample efficiency in offline reinforcement learning",
#     "Zero-shot object detection using contrastive learning",
#     "Knowledge graph grounding for LLM generation",
#     "Causality inference with unobserved confounding",
#     "Neural Radiance Fields (NeRF) for real-time synthesis",
#     "Heterogeneous graph neural networks for recommendation systems",
#     "Parameter-efficient fine-tuning (PEFT) methods comparison",
#     "Certified adversarial robustness in image classification",
#     "Multi-agent reinforcement learning equilibrium strategies",
#     "Spiking Neural Networks (SNNs) for edge computing",
#     "Retrieval-Augmented Generation (RAG) knowledge retrieval latency",
#     "Foundation models for remote sensing image segmentation",
#     "Explaining model predictions using SHAP and LIME in complex models",
#     "Neuro-symbolic AI fusion for automated theorem proving",
#     "Intrinsic motivation methods for sparse reward environments",

# ]

queries_list = ["Preventing LLM hallucination",
"Different types of transformer",
"Trust and Ethics in Small Language Models (SLMs)",
"Semantic Communication Alignment (e.g., \"Bayesian Weak-to-Strong Alignment\")"]
collections_to_search = [MILVUS_COLLECTION_NAME, MILVUS_CUSTOM_COLLECTION_NAME]

# 5. Call the Search Function
structured_results = search_milvus_collections(
    queries=queries_list,
    collection_names=collections_to_search,
    paper_lookup=paper_lookup,
    search_params=search_params,
    default_model=default_model,
    finetuned_model=finetuned_model
)

# 6. Display/Save Results
print("\n--- Final Structured Results ---")
# Print the structured data (you can convert this to JSON/CSV later)
import pandas as pd
import json

# NOTE: structured_results must be defined before this block.
# We use a placeholder here for the purpose of showing the complete logic.

def flatten_milvus_results(data):
    """Flattens the nested search results into a list of single-row dictionaries."""
    flat_data = []

    for item in data:
        query = item['query']

        # Process 'base' results
        for result in item['base']:
            row = {'Query': query, 'Source': 'Base', **result}
            flat_data.append(row)

        # Process 'custom' results
        for result in item['custom']:
            row = {'Query': query, 'Source': 'Custom', **result}
            flat_data.append(row)

    return flat_data

# 1. Flatten the data
flat_records = flatten_milvus_results(structured_results)
# 2. Convert to Pandas DataFrame
results_df = pd.DataFrame(flat_records)

# 2.5. CODE TO REMOVE DUPLICATES (Added)
# Duplicates are identified by the combination of Query, Source (Base/Custom), and Title.
# Since Milvus results are ordered by score, 'keep='first'' keeps the highest scoring result.
df_cleaned = results_df.drop_duplicates(
    subset=['Query', 'Source', 'title'],
    keep='first'
)

df_cleaned.head(10)

# 3. Clean and format columns (operating on the cleaned DataFrame)
df_cleaned['Description Snippet'] = df_cleaned['description'].str.replace('\\n', ' ', regex=False).str.strip().str[:100] + '...'
df_cleaned['Score'] = df_cleaned['score'].round(4)

# 4. Select and rename final columns for the table
df_final = df_cleaned[['Query', 'Source', 'Score', 'id', 'title', 'Description Snippet']]
df_final.rename(columns={'id': 'ID', 'title': 'Title'}, inplace=True)


# 5. Display the DataFrame
print("--- Final Search Results Table ---")
df_final.head(100)

Loading data from /content/drive/MyDrive/google_colab/arxiv/research_paper_202511302127.csv...
Models loaded successfully.
--- Starting Semantic Search for 4 Queries ---

Processing Query: 'Preventing LLM hallucination'
  Searching Collection: research_papers
Connecting to Milvus...
Collection 'research_papers' already exists.
  Searching Collection: research_papers_custom
Connecting to Milvus...
Collection 'research_papers_custom' already exists.

Processing Query: 'Different types of transformer'
  Searching Collection: research_papers
Connecting to Milvus...
Collection 'research_papers' already exists.
  Searching Collection: research_papers_custom
Connecting to Milvus...
Collection 'research_papers_custom' already exists.

Processing Query: 'Trust and Ethics in Small Language Models (SLMs)'
  Searching Collection: research_papers
Connecting to Milvus...
Collection 'research_papers' already exists.
  Searching Collection: research_papers_custom
Connecting to Milvus...
Collection 're

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Description Snippet'] = df_cleaned['description'].str.replace('\\n', ' ', regex=False).str.strip().str[:100] + '...'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Score'] = df_cleaned['score'].round(4)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.rename(columns={'id': 'ID', 'title': 'Title'}, inplace=True)


Unnamed: 0,Query,Source,Score,ID,Title,Description Snippet
0,Preventing LLM hallucination,Base,0.6025,e279ba01-6aa6-4819-a4bd-f37c466b33d9,Hallucination Detection and Hallucination Miti...,"Large language models (LLMs), including ChatGP..."
1,Preventing LLM hallucination,Base,0.5681,1296ec13-e495-47f8-a240-4cabf53cf15a,Hallucination Stations: On Some Basic Limitati...,In this paper we explore hallucinations and re...
3,Preventing LLM hallucination,Base,0.5619,ad5ab443-9a56-43dc-83ac-fe1cbcf1ac2e,Grounding the Ungrounded: A Spectral-Graph Fra...,Hallucinations in LLMs--especially in multimod...
5,Preventing LLM hallucination,Base,0.5595,ab82562e-2dcc-4a99-bf85-b7b5d5bb2e25,"Redefining ""Hallucination"" in LLMs: Towards a ...","In recent years, large language models (LLMs) ..."
6,Preventing LLM hallucination,Base,0.5525,f8d8a678-9958-44eb-8499-75f936c56b6e,LLM-based Agents Suffer from Hallucinations: A...,Driven by the rapid advancements of Large Lang...
8,Preventing LLM hallucination,Base,0.533,2c69ae91-b3f5-46f1-a094-0092673d9a2b,Probing LLM Hallucination from Within: Perturb...,"LLM hallucination, where unfaithful text is ge..."
10,Preventing LLM hallucination,Custom,0.7872,e279ba01-6aa6-4819-a4bd-f37c466b33d9,Hallucination Detection and Hallucination Miti...,"Large language models (LLMs), including ChatGP..."
11,Preventing LLM hallucination,Custom,0.7551,70a59924-50de-47c0-ae78-659601a8209b,Banishing LLM Hallucinations Requires Rethinki...,"Despite their powerful chat, coding, and reaso..."
13,Preventing LLM hallucination,Custom,0.7487,c1ef66f6-4681-4e5d-b156-785f2e0f03ab,Unsupervised Real-Time Hallucination Detection...,Hallucinations in large language models (LLMs)...
14,Preventing LLM hallucination,Custom,0.7332,5f645ecc-8e7b-45fe-96ee-df4845058378,"Look Within, Why LLMs Hallucinate: A Causal Pe...",The emergence of large language models (LLMs) ...


In [13]:
from pymilvus import utility, connections,MilvusClient

# 1. Connect to Milvus (if not already connected)
# connections.connect(alias="default", host="localhost", port="19530")
client = MilvusClient(uri=MILVUS_URI)
# 2. List all collections
collection_names = utility.list_collections()

# 3. Get the total count
total_collection_count = len(collection_names)

print(f"Total Collections: {total_collection_count}")
print(f"Collection Names: {collection_names}")
collection_stats = {}

for name in collection_names:
    # Use get_collection_stats or num_entities for row count
    # get_collection_stats is often preferred as it's more comprehensive
    stats = client.get_collection_stats(collection_name=name)

    # In Milvus, 'row_count' represents the number of entities
    entity_count = stats.get('row_count', 'N/A')

    collection_stats[name] = {
        "entity_count": entity_count
        # stats dictionary may contain other information like 'data_size' in newer versions
        # or specific configurations, but 'row_count' is the standard entity count.
    }

print("\nCollection Entity Counts:")
for name, stats in collection_stats.items():
    print(f"- **{name}**: {stats['entity_count']} entities")

Total Collections: 2
Collection Names: ['research_papers', 'research_papers_custom']

Collection Entity Counts:
- **research_papers**: 245004 entities
- **research_papers_custom**: 245004 entities
