In [None]:
import json
import os
from itertools import combinations

import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# Loading Dataset : NFCorpus

In [None]:
from datasets import load_dataset

In [None]:
from tqdm import tqdm

In [None]:
# Load the "corpus" configuration of BeIR/nfcorpus (the document collection).
dataset = load_dataset("BeIR/nfcorpus", "corpus")

In [None]:
dataset

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 3633
    })
})

In [None]:
# Load the "queries" configuration of BeIR/nfcorpus (query texts with their ids).
dataset_q = load_dataset("BeIR/nfcorpus", "queries")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

nfcorpus.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

queries/queries/0000.parquet:   0%|          | 0.00/84.8k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/3237 [00:00<?, ? examples/s]

In [None]:
dataset_q

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 3237
    })
})

In [None]:
# Load the test split of the relevance judgments (query-id, corpus-id, score rows).
dataset_qrel = load_dataset("BeIR/nfcorpus-qrels", split="test")

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/258k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/280k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/110575 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11385 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12334 [00:00<?, ? examples/s]

In [None]:
dataset_qrel

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 12334
})

In [None]:
dataset_qrel[0]

{'query-id': 'PLAIN-2', 'corpus-id': 'MED-2427', 'score': 2}

In [None]:
from collections import defaultdict

# Relevance judgments grouped per query: {query_id: {corpus_id: score}}
qrels = {}

# Each row contributes one (corpus_id -> score) entry under its query id
for judgment in dataset_qrel:
    per_query_scores = qrels.setdefault(judgment["query-id"], {})
    per_query_scores[judgment["corpus-id"]] = judgment["score"]

In [None]:
qrels

{'PLAIN-2': {'MED-2427': 2,
  'MED-10': 2,
  'MED-2429': 2,
  'MED-2430': 2,
  'MED-2431': 2,
  'MED-14': 2,
  'MED-2432': 2,
  'MED-2428': 1,
  'MED-2440': 1,
  'MED-2434': 1,
  'MED-2435': 1,
  'MED-2436': 1,
  'MED-2437': 1,
  'MED-2438': 1,
  'MED-2439': 1,
  'MED-3597': 1,
  'MED-3598': 1,
  'MED-3599': 1,
  'MED-4556': 1,
  'MED-4559': 1,
  'MED-4560': 1,
  'MED-4828': 1,
  'MED-4829': 1,
  'MED-4830': 1},
 'PLAIN-12': {'MED-2513': 2,
  'MED-5237': 2,
  'MED-2517': 2,
  'MED-2518': 2,
  'MED-2519': 2,
  'MED-2520': 2,
  'MED-2521': 2,
  'MED-2514': 1,
  'MED-2943': 1,
  'MED-5322': 1,
  'MED-5323': 1,
  'MED-5324': 1,
  'MED-5325': 1,
  'MED-5326': 1,
  'MED-5327': 1,
  'MED-5328': 1,
  'MED-5329': 1,
  'MED-5330': 1,
  'MED-5331': 1,
  'MED-5332': 1,
  'MED-5333': 1,
  'MED-5334': 1,
  'MED-5335': 1,
  'MED-5363': 1,
  'MED-5337': 1,
  'MED-5338': 1,
  'MED-5339': 1,
  'MED-5340': 1,
  'MED-5341': 1,
  'MED-5342': 1},
 'PLAIN-23': {'MED-2644': 2,
  'MED-2646': 2,
  'MED-2651': 2

In [None]:
# len(qrels) counts query ids that have judgments; the query-document pairs are
# the entries of the inner dictionaries.
print(f"Number of queries with relevance judgments: {len(qrels)}")
print(f"Number of query-document pairs: {sum(len(docs) for docs in qrels.values())}")

Number of query-document pairs: 323


In [None]:
# Lookup table from query id to query text.
queries = {row["_id"]: row["text"] for row in dataset_q["queries"]}

In [None]:
print(f"Number of queries: {len(queries)}")


Number of queries: 3237


In [None]:
# Lookup table from document id to document text (the title field is not included).
corpus = {entry["_id"]: entry["text"] for entry in dataset["corpus"]}

In [None]:
print(f"Corpus size: {len(corpus)}")
print(f"Number of queries: {len(queries)}")
# len(qrels) counts query ids that have judgments; the query-document pairs are
# the entries of the inner dictionaries.
print(f"Number of queries with relevance judgments: {len(qrels)}")
print(f"Number of query-document pairs: {sum(len(docs) for docs in qrels.values())}")

Corpus size: 3633
Number of queries: 3237
Number of query-document pairs: 323


In [None]:
len(corpus)

3633

# Part 1

In [None]:
# Load a seq2seq generative model (FLAN-T5 by default) together with its tokenizer
def load_generative_model(model_name="google/flan-t5-large"):
    """
    Load the tokenizer and seq2seq model for a Hugging Face checkpoint.

    Parameters:
    - model_name: Checkpoint name passed to both `from_pretrained` calls.
      The model is loaded through AutoModelForSeq2SeqLM.

    Returns:
    - (tokenizer, model): The loaded tokenizer and model, in that order.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )

In [None]:
# Segment document into chunks
def segment_document(document, chunk_size, overlap):
    """
    Split a document into chunks of whitespace-separated tokens.

    Parameters:
    - document: The document text (string).
    - chunk_size: Maximum number of tokens per chunk (must be positive).
    - overlap: Number of tokens shared by consecutive chunks
      (must satisfy 0 <= overlap < chunk_size).

    Returns:
    - chunks: List of chunk strings; empty when the document has no tokens.

    Raises:
    - ValueError: If chunk_size or overlap would make the window stand still,
      move backwards, or skip tokens.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = document.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Once a chunk reaches the last token, any further window would only
        # repeat tokens already covered by this chunk.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [None]:
def generate_pseudo_queries(chunks, tokenizer, model, top_k_ques=5):
    """
    Generate sampled pseudo-queries for each chunk with a seq2seq model.

    Parameters:
    - chunks: List of chunk strings to generate questions for.
    - tokenizer: Tokenizer matching `model`.
    - model: Generative model exposing `.to()` and `.generate()`.
    - top_k_ques: Number of queries to sample per chunk.

    Returns:
    - queries: List with one entry per chunk, each a list of `top_k_ques`
      generated strings. Empty list when `chunks` is empty.
    """
    # Nothing to generate; also avoids calling the tokenizer with an empty batch.
    if not chunks:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the same device (GPU or CPU)

    # Prepare the prompts
    prompts = [
        f"Generate a detailed and nuanced question focusing on the most significant aspects of the passage:\n\n{chunk}\n\n"
        for chunk in chunks
    ]

    # Tokenize the input; prompts longer than 512 tokens are truncated
    inputs = tokenizer(prompts, truncation=True, padding=True, return_tensors="pt", max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate pseudo queries without gradients
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=20,                     # Each generated query is capped at 20 tokens
            num_return_sequences=top_k_ques,   # Number of queries to generate per chunk
            do_sample=True,                    # Use sampling to generate diverse queries
            top_k=5                            # Sample only from the 5 most probable next tokens
        )

    # Decode the generated sequences into a flat list of strings
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # The flat list holds `top_k_ques` consecutive outputs per chunk; regroup it per chunk
    return [
        decoded[start:start + top_k_ques]
        for start in range(0, len(decoded), top_k_ques)
    ]


In [None]:

# Diversity filtering using semantic similarity
def filter_diverse_queries(queries, similarity_model, threshold=0.8):
    """
    Drop queries that are near-duplicates of a query that was already kept.

    Queries are visited in order; a query is kept unless its cosine similarity
    to any previously kept query exceeds `threshold`.

    Parameters:
    - queries: List of query strings.
    - similarity_model: Model whose `encode(queries, convert_to_tensor=True)`
      returns one embedding row per query.
    - threshold: Cosine-similarity value above which a query counts as redundant.

    Returns:
    - filtered_queries: The kept queries, in their original order.
    """
    if len(queries) == 0:
        return []

    embeddings = similarity_model.encode(queries, convert_to_tensor=True)

    # All pairwise cosine similarities in one call; entry [i][j] compares query i with query j.
    pairwise = torch.nn.functional.cosine_similarity(
        embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=-1
    ).tolist()

    # Track the positions of the kept queries so each candidate is compared with
    # the embeddings of the queries actually kept, not with the first few inputs.
    kept_indices = []
    for i in range(len(queries)):
        is_redundant = any(pairwise[i][j] > threshold for j in kept_indices)
        if not is_redundant:
            kept_indices.append(i)
    return [queries[i] for i in kept_indices]


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Calculate the similarity score for a query-document pair.

    Note: this notebook defines this function in two cells; keep both
    definitions in sync or remove one of them.

    Parameters:
    - user_pseudo_query: The query text (string), or an already computed
      embedding tensor for it. A tensor is used as-is, which lets callers
      encode a query once and score it against many documents.
    - document_pseudo_queries: Tensor holding the pre-computed embeddings of
      one document's pseudo-queries (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model. Only used when
      `user_pseudo_query` is text.

    Returns:
    - max_similarity: Maximum cosine similarity (float) between the query and
      the document's pseudo-query embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if torch.is_tensor(user_pseudo_query):
        # Caller supplied the embedding; no model call needed.
        query_embedding = user_pseudo_query.to(device)
    else:
        similarity_model_gpu = similarity_model.to(device)
        # Compute embedding for the query on the specified device
        query_embedding = similarity_model_gpu.encode(
            user_pseudo_query,
            convert_to_tensor=True,
            device=device
        ).to(device)

    # Work with 2-D (rows x dim) tensors on both sides
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Ensure pseudo-query embeddings are also on the same device
    document_pseudo_queries = document_pseudo_queries.to(device)
    if document_pseudo_queries.dim() == 1:
        document_pseudo_queries = document_pseudo_queries.unsqueeze(0)

    # Pairwise cosine similarity: (query rows) x (pseudo-query rows)
    similarity_scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(1), document_pseudo_queries.unsqueeze(0), dim=-1
    )

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is encoded once and then compared with every document, instead
    of being re-encoded for each document.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors of pre-computed pseudo-query embeddings.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending
      scores, where score is the maximum cosine similarity between the query
      and that document's pseudo-query embeddings.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    similarity_model_gpu = similarity_model.to(device)

    # Loop-invariant: encode the query a single time
    query_embedding = similarity_model_gpu.encode(
        query,
        convert_to_tensor=True,
        device=device
    ).to(device)
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_rows = query_embedding.unsqueeze(1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        doc_rows = pseudo_queries_embeddings.to(device)
        if doc_rows.dim() == 1:
            doc_rows = doc_rows.unsqueeze(0)
        similarity_scores = torch.nn.functional.cosine_similarity(
            query_rows, doc_rows.unsqueeze(0), dim=-1
        )
        document_scores[doc_id] = torch.max(similarity_scores).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Model names
generative_model_name = "google/flan-t5-large"  # seq2seq checkpoint passed to process_corpus
similarity_model_name = "all-mpnet-base-v2"  # Sentence Transformer used for query embeddings
chunk_size=500  # chunk length in whitespace-separated tokens (see segment_document)
overlap=0  # number of tokens shared by consecutive chunks
threshold=0.8  # cosine-similarity cutoff passed to filter_diverse_queries

In [None]:
# Instantiate the sentence-embedding model once; later cells reuse this object.
similarity_model = SentenceTransformer(similarity_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Main function to process the corpus
def process_corpus(corpus, generative_model_name, similarity_model, chunk_size, overlap, threshold):
    """
    Generate diversity-filtered pseudo-queries for every document in the corpus.

    Each document is split into chunks, pseudo-queries are generated for all
    chunks, near-duplicates are filtered within each chunk's queries, and the
    surviving queries of all chunks are collected for the document.

    Parameters:
    - corpus: Dictionary mapping document IDs to document text.
    - generative_model_name: Checkpoint name passed to load_generative_model.
    - similarity_model: Preloaded Sentence Transformer model used for filtering.
    - chunk_size: Chunk length in whitespace-separated tokens.
    - overlap: Number of tokens shared by consecutive chunks.
    - threshold: Cosine-similarity cutoff for filter_diverse_queries.

    Returns:
    - document_generated_queries: Dictionary mapping each document ID to its
      list of pseudo-queries (empty list for documents without any tokens).
    """
    tokenizer, generative_model = load_generative_model(generative_model_name)

    document_generated_queries = {}

    # Loop over each document in the corpus with tqdm progress bar
    for doc_id, document in tqdm(corpus.items(), desc="Processing Documents"):
        # Segment the document into chunks
        chunks = segment_document(document, chunk_size, overlap)

        # A document with no tokens has nothing to generate from
        if not chunks:
            document_generated_queries[doc_id] = []
            continue

        # One list of generated queries per chunk
        generated_queries = generate_pseudo_queries(chunks, tokenizer, generative_model)

        # Keep the queries of every chunk, not only those of the first chunk
        document_queries = []
        for chunk_queries in generated_queries:
            document_queries.extend(
                filter_diverse_queries(chunk_queries, similarity_model, threshold)
            )

        document_generated_queries[doc_id] = document_queries
    return document_generated_queries


# NOTE
Below is the code that calls the functions for generating the pseudo queries.
You do not need to generate the pseudo queries yourself. Use the following link to download the generated pseudo queries, which you can use to train the autoregressive model.

Link: https://drive.google.com/drive/folders/191D9QMsCVku2V1aCE0ZlkWvDqCzXlWQ3?usp=sharing

Check the files for their suffix to know which dataset they contain.

In [None]:
# Generate pseudo-queries for the whole corpus (the recorded run took about 58 minutes).
results = process_corpus(corpus, generative_model_name, similarity_model,chunk_size, overlap, threshold)

Processing Documents: 100%|██████████| 3633/3633 [57:48<00:00,  1.05it/s]


In [None]:
# with open("/content/drive/MyDrive/646Project/646Project/test/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus_DONOTRERUN.json", "w") as file:
#     json.dump(results, file, indent=4)
# print("Data saved as generated_queries.json")

Data saved as generated_queries.json


In [None]:
# type(results)

## PART 2

In [None]:
from openai import OpenAI
from nltk.corpus import wordnet
import nltk

# Download WordNet data if not already available
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from openai import OpenAI

In [None]:
# Function to get synonyms for a word
def get_synonyms(word):
    """
    Collect the distinct WordNet lemma names for a word.

    Parameters:
    - word: The word to look up (string).

    Returns:
    - synonyms: List of unique lemma names with underscores replaced by spaces,
      in the order WordNet yields them, so slicing the list gives the same
      result on every run.
    """
    # A dict keeps first-seen order while removing duplicates; converting a set
    # to a list would give an order that can differ between interpreter runs.
    synonyms = {}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.setdefault(lemma.name().replace("_", " "), None)
    return list(synonyms)

# Function to rephrase using GPT-4
# def rephrase_query_with_gpt(query, api_key):
#     prompt=f"Rephrase the following query to make it clearer and more comprehensive: {query}",
#     # openai.api_key = api_key
#     client = OpenAI(api_key=api_key)
#     response = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=50,
#         temperature=0.7
#     )
#     # response = openai.Completion.create(
#     #     engine="gpt-4o-mini",  # Use the appropriate GPT model engine
#     #     prompt=f"Rephrase the following query to make it clearer and more comprehensive: {query}",
#     #     max_tokens=50,
#     #     temperature=0.7
#     # )
#     # response.choices[0].message.content.strip("[]")
#     # return response.choices[0].text.strip("[]")
#     return response.json()

def rephrase_query_with_gpt(query, api_key):
    """
    Ask an OpenAI chat model to rephrase a query.

    Parameters:
    - query: The query text to rephrase (string).
    - api_key: OpenAI API key used to create the client.

    Returns:
    - The rephrased query with surrounding whitespace removed. If the response
      carries no text, the input query is returned unchanged so callers always
      receive a usable string.
    """
    from openai import OpenAI

    # Ensure prompt is a string, not a tuple
    prompt = f"Rephrase the following query to make it clearer and more comprehensive: {query}"

    # Initialize the OpenAI client
    client = OpenAI(api_key=api_key)

    # Make the request to the Chat Completions endpoint
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,  # the reply is cut off after 50 tokens
        temperature=0.7
    )

    # The message content can be missing; calling .strip() on None would raise.
    content = response.choices[0].message.content
    if content is None or not content.strip():
        return query
    return content.strip()

# Main function to generate a pseudo query
def generate_pseudo_query(query, api_key):
    """
    Expand a query with WordNet synonyms and rephrase the result with GPT.

    Parameters:
    - query: The original user query (string).
    - api_key: OpenAI API key forwarded to rephrase_query_with_gpt.

    Returns:
    - rephrased_query: The text returned by rephrase_query_with_gpt for the
      expanded query.
    """
    expanded_terms = []

    # Each original word is followed by up to 2 of its synonyms to avoid over-expansion
    for word in query.split():
        expanded_terms.append(word)
        expanded_terms.extend(get_synonyms(word)[:2])

    # Remove duplicates while keeping the original word order; a set would
    # shuffle the words and could differ from run to run.
    expanded_query_str = " ".join(dict.fromkeys(expanded_terms))

    # Rephrase using GPT
    return rephrase_query_with_gpt(expanded_query_str, api_key)

# Part 3

Read the file from the link above and load its contents into the `data` object, then use it to compute the evaluation metrics.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

file_path = "/content/drive/MyDrive/646Project/sharedfiles/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus.json"

# Read the generated pseudo-queries; the encoding is stated explicitly so the
# result does not depend on the platform's default text encoding.
with open(file_path, "r", encoding="utf-8") as file:
  data = json.load(file)


In [None]:
# Alias for the loaded JSON; a later cell iterates it as (doc_id, pseudo_queries) items.
document_pseudo_queries = data

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so a real key is
# never saved in the notebook; the original placeholder is kept as the fallback.
api_key = os.environ.get("OPENAI_API_KEY", "API_KEY")

# NOTE

For now, we use the first 100 queries from qrels to test Part 3. Once the model is trained, the pseudo queries it generates for those 100 queries should be used instead.

In [None]:
# First 100 (query_id, {doc_id: score}) entries of qrels; despite the name this is a list of tuples.
queries_dict = list(qrels.items())[:100]

In [None]:
# type(queries_dict)

list

In [None]:
# result = [item[0] for item in queries_dict]

In [None]:
# queries_dict

[('PLAIN-2',
  {'MED-2427': 2,
   'MED-10': 2,
   'MED-2429': 2,
   'MED-2430': 2,
   'MED-2431': 2,
   'MED-14': 2,
   'MED-2432': 2,
   'MED-2428': 1,
   'MED-2440': 1,
   'MED-2434': 1,
   'MED-2435': 1,
   'MED-2436': 1,
   'MED-2437': 1,
   'MED-2438': 1,
   'MED-2439': 1,
   'MED-3597': 1,
   'MED-3598': 1,
   'MED-3599': 1,
   'MED-4556': 1,
   'MED-4559': 1,
   'MED-4560': 1,
   'MED-4828': 1,
   'MED-4829': 1,
   'MED-4830': 1}),
 ('PLAIN-12',
  {'MED-2513': 2,
   'MED-5237': 2,
   'MED-2517': 2,
   'MED-2518': 2,
   'MED-2519': 2,
   'MED-2520': 2,
   'MED-2521': 2,
   'MED-2514': 1,
   'MED-2943': 1,
   'MED-5322': 1,
   'MED-5323': 1,
   'MED-5324': 1,
   'MED-5325': 1,
   'MED-5326': 1,
   'MED-5327': 1,
   'MED-5328': 1,
   'MED-5329': 1,
   'MED-5330': 1,
   'MED-5331': 1,
   'MED-5332': 1,
   'MED-5333': 1,
   'MED-5334': 1,
   'MED-5335': 1,
   'MED-5363': 1,
   'MED-5337': 1,
   'MED-5338': 1,
   'MED-5339': 1,
   'MED-5340': 1,
   'MED-5341': 1,
   'MED-5342': 1}),
 

In [None]:
# type(result_2)

list

In [None]:
# with open("result.tsv", "w") as file:
#     for item in result:
#         file.write(item + "\n")

In [None]:

# import json
# with open("validation_result.json", "r") as file:
#     data = json.load(file)

# result_2 = list(data.keys())


In [None]:
# a_minus_b = list(set(result) - set(result_2))
# b_minus_a = list(set(result_2) - set(result))

# # Output the results
# print("A - B:", a_minus_b)
# print("B - A:", b_minus_a)

A - B: []
B - A: []


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Calculate the similarity score for a query-document pair.

    Note: this notebook defines this function in two cells; keep both
    definitions in sync or remove one of them.

    Parameters:
    - user_pseudo_query: The query text (string), or an already computed
      embedding tensor for it. A tensor is used as-is, which lets callers
      encode a query once and score it against many documents.
    - document_pseudo_queries: Tensor holding the pre-computed embeddings of
      one document's pseudo-queries (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model. Only used when
      `user_pseudo_query` is text.

    Returns:
    - max_similarity: Maximum cosine similarity (float) between the query and
      the document's pseudo-query embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if torch.is_tensor(user_pseudo_query):
        # Caller supplied the embedding; no model call needed.
        query_embedding = user_pseudo_query.to(device)
    else:
        similarity_model_gpu = similarity_model.to(device)
        # Compute embedding for the query on the specified device
        query_embedding = similarity_model_gpu.encode(
            user_pseudo_query,
            convert_to_tensor=True,
            device=device
        ).to(device)

    # Work with 2-D (rows x dim) tensors on both sides
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Ensure pseudo-query embeddings are also on the same device
    document_pseudo_queries = document_pseudo_queries.to(device)
    if document_pseudo_queries.dim() == 1:
        document_pseudo_queries = document_pseudo_queries.unsqueeze(0)

    # Pairwise cosine similarity: (query rows) x (pseudo-query rows)
    similarity_scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(1), document_pseudo_queries.unsqueeze(0), dim=-1
    )

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is encoded once and then compared with every document, instead
    of being re-encoded for each document.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors of pre-computed pseudo-query embeddings.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending
      scores, where score is the maximum cosine similarity between the query
      and that document's pseudo-query embeddings.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    similarity_model_gpu = similarity_model.to(device)

    # Loop-invariant: encode the query a single time
    query_embedding = similarity_model_gpu.encode(
        query,
        convert_to_tensor=True,
        device=device
    ).to(device)
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_rows = query_embedding.unsqueeze(1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        doc_rows = pseudo_queries_embeddings.to(device)
        if doc_rows.dim() == 1:
            doc_rows = doc_rows.unsqueeze(0)
        similarity_scores = torch.nn.functional.cosine_similarity(
            query_rows, doc_rows.unsqueeze(0), dim=-1
        )
        document_scores[doc_id] = torch.max(similarity_scores).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Pre-compute one embedding tensor per document so ranking never has to
# re-encode the documents' pseudo-queries.
test_dict = {}

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
similarity_model_gpu = similarity_model.to(device)

document_pseudo_queries_embeddings = {}
for doc_id, pseudo_queries in tqdm(document_pseudo_queries.items()):
    # Encode all pseudo-queries of this document in a single call
    pseudo_query_embeddings = similarity_model_gpu.encode(
        pseudo_queries,
        convert_to_tensor=True,
        device=device,
    )
    document_pseudo_queries_embeddings[doc_id] = pseudo_query_embeddings



100%|██████████| 3633/3633 [00:59<00:00, 61.13it/s]


  # NOTE
  Replace the commented line in the code below to get the pseudo query generated by the model for the top 100 queries in qrels.

  *user_pseudo_query = generate_pseudo_query(queries[query_id], api_key)*


  Suggestion: Instead of generating inside the loop, it would be better to generate the pseudo queries for all 100 queries at once and then use them as needed.


In [None]:
# import json
# with open("/content/drive/MyDrive/646Project/sharedfiles/validation_result_nfcorpus.json", "r") as file:
#     user_pseudo_queries = json.load(file)

In [None]:
# for query_id, doc_score in tqdm(queries_dict):
#   # print(query_id)
#   user_pseudo_query = generate_pseudo_query(queries[query_id], api_key)

In [None]:
qrel_pseudo = {}
# Start from an empty result here so this cell does not depend on another cell
# having created test_dict, and a re-run never mixes in scores from a previous run.
test_dict = {}
for query_id, doc_score in tqdm(queries_dict):
  # Expand and rephrase the original query text (one API call per query)
  user_pseudo_query = generate_pseudo_query(queries[query_id], api_key)
  qrel_pseudo[query_id] = user_pseudo_query

  # (doc_id, score) pairs sorted by descending score
  doc_rank_for_query = rank_documents_by_query(user_pseudo_query, document_pseudo_queries_embeddings, similarity_model)

  # Dicts keep insertion order, so each inner dict stays in ranked order
  test_dict[query_id] = {doc_id: float(score) for doc_id, score in doc_rank_for_query}


100%|██████████| 100/100 [1:41:00<00:00, 60.61s/it]


In [None]:
test_dict

{'PLAIN-2': {'MED-2429': 0.7871258854866028,
  'MED-2431': 0.7832151055335999,
  'MED-10': 0.7245281338691711,
  'MED-14': 0.7119804620742798,
  'MED-2440': 0.6406773328781128,
  'MED-1193': 0.6092426776885986,
  'MED-2428': 0.6002280116081238,
  'MED-4828': 0.5861200094223022,
  'MED-4827': 0.5646580457687378,
  'MED-4562': 0.5541160106658936,
  'MED-3862': 0.5429224371910095,
  'MED-3856': 0.5417379140853882,
  'MED-5117': 0.5340003967285156,
  'MED-3833': 0.5338658690452576,
  'MED-2439': 0.5259724855422974,
  'MED-2437': 0.5239633321762085,
  'MED-4162': 0.5224952697753906,
  'MED-4160': 0.516817033290863,
  'MED-4829': 0.5000696182250977,
  'MED-3844': 0.4996461272239685,
  'MED-3832': 0.4989967346191406,
  'MED-2103': 0.4973810315132141,
  'MED-2530': 0.49178946018218994,
  'MED-1564': 0.49010396003723145,
  'MED-3791': 0.49007511138916016,
  'MED-5351': 0.4899674355983734,
  'MED-4751': 0.48215368390083313,
  'MED-2770': 0.47836658358573914,
  'MED-2423': 0.4777323305606842,
  '

In [None]:
top_n = 10

In [None]:
# Keep the first top_n entries of every ranking; the inner dicts of test_dict
# were filled in descending score order, so these are the best-scoring documents.
top_documents = {
    query_id: dict(list(doc_scores.items())[:top_n])
    for query_id, doc_scores in test_dict.items()
}

In [None]:
top_documents

{'PLAIN-2': {'MED-2429': 0.7871258854866028,
  'MED-2431': 0.7832151055335999,
  'MED-10': 0.7245281338691711,
  'MED-14': 0.7119804620742798,
  'MED-2440': 0.6406773328781128,
  'MED-1193': 0.6092426776885986,
  'MED-2428': 0.6002280116081238,
  'MED-4828': 0.5861200094223022,
  'MED-4827': 0.5646580457687378,
  'MED-4562': 0.5541160106658936},
 'PLAIN-12': {'MED-4711': 0.5862591862678528,
  'MED-2694': 0.4255580008029938,
  'MED-1200': 0.42543095350265503,
  'MED-1523': 0.41171830892562866,
  'MED-707': 0.40782004594802856,
  'MED-4888': 0.4068473279476166,
  'MED-3923': 0.40514615178108215,
  'MED-4113': 0.3983815312385559,
  'MED-4060': 0.3963720202445984,
  'MED-868': 0.39631175994873047},
 'PLAIN-23': {'MED-2994': 0.5975139737129211,
  'MED-2661': 0.5311875343322754,
  'MED-4072': 0.5102498531341553,
  'MED-1174': 0.5019991397857666,
  'MED-118': 0.5006904602050781,
  'MED-1172': 0.4966268241405487,
  'MED-1961': 0.48701605200767517,
  'MED-2404': 0.4861348569393158,
  'MED-1164'

In [None]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308218 sha256=11a473e54b8f4bb56a91b7a86839a520ca32ddacf910cc00b23144d668329d07
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [None]:
import pytrec_eval

In [None]:
# Precision, recall, nDCG and MAP at cutoffs 3, 5 and 10
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P.3,5,10', 'recall.3,5,10', 'ndcg_cut.3,5,10', 'map_cut.3,5,10'})
result = evaluator.evaluate(top_documents)  # per-query measures; inspect `result` directly if needed
metrics = ['P','ndcg_cut', 'recall', 'map_cut']
cutoffs = [3,5,10]

# Average over the queries that were actually evaluated, so the divisor always
# matches the number of summed entries.
run_length = len(result)
scores = {f'{metric}_{cutoff}': 0 for metric in metrics for cutoff in cutoffs}
for key in result:
  for name in scores:
    scores[name] += result[key][name]
if run_length:
  for name in scores:
    scores[name] /= run_length

# Short summary instead of printing the full per-query dictionary
print(f"Evaluated {run_length} queries")

{'PLAIN-2': {'P_3': 1.0, 'P_5': 1.0, 'P_10': 0.7, 'recall_3': 0.125, 'recall_5': 0.20833333333333334, 'recall_10': 0.2916666666666667, 'ndcg_cut_3': 1.0, 'ndcg_cut_5': 0.934397461243829, 'ndcg_cut_10': 0.7527738488042618, 'map_cut_3': 0.125, 'map_cut_5': 0.20833333333333334, 'map_cut_10': 0.2805059523809524}, 'PLAIN-12': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, 'PLAIN-23': {'P_3': 0.3333333333333333, 'P_5': 0.4, 'P_10': 0.2, 'recall_3': 0.011111111111111112, 'recall_5': 0.022222222222222223, 'recall_10': 0.022222222222222223, 'ndcg_cut_3': 0.2960819109658652, 'ndcg_cut_5': 0.34519134224686937, 'ndcg_cut_10': 0.22400556151517553, 'map_cut_3': 0.005555555555555556, 'map_cut_5': 0.01, 'map_cut_10': 0.01}, 'PLAIN-33': {'P_3': 0.6666666666666666, 'P_5': 0.8, 'P_10': 0.4, 'recall_3': 0.0625, 'recall_5': 0.125, 'recall_10': 0.125, '

In [None]:
scores

{'P_3': 0.31666666666666665,
 'P_5': 0.2659999999999998,
 'P_10': 0.21199999999999997,
 'ndcg_cut_3': 0.34037818785922247,
 'ndcg_cut_5': 0.3215064194692127,
 'ndcg_cut_10': 0.2999607714925095,
 'recall_3': 0.09484654627113795,
 'recall_5': 0.11848272333041479,
 'recall_10': 0.15016120566295968,
 'map_cut_3': 0.08059241803223088,
 'map_cut_5': 0.09323562834664453,
 'map_cut_10': 0.10957271484635536}