In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from itertools import combinations
import numpy as np
import json
from tqdm import tqdm

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Loading Dataset: SciFact (BeIR)

In [None]:
from datasets import load_dataset

In [None]:
from tqdm import tqdm

In [None]:
dataset = load_dataset("BeIR/scifact", "corpus")

In [None]:
dataset

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 3633
    })
})

In [None]:
dataset_q = load_dataset("BeIR/scifact", "queries")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

scifact.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

queries/queries/0000.parquet:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/1109 [00:00<?, ? examples/s]

In [None]:
dataset_q

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 1109
    })
})

In [None]:
dataset_qrel = load_dataset("BeIR/scifact-qrels", split="test")

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/919 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/339 [00:00<?, ? examples/s]

In [None]:
dataset_qrel

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 339
})

In [None]:
dataset_qrel[0]

{'query-id': 1, 'corpus-id': 31715818, 'score': 1}

In [None]:
from collections import defaultdict

# Relevance judgments as {query id: {document id: score}}; both ids are cast to str
qrels = defaultdict(dict)
for qrel in dataset_qrel:
    query_key, doc_key = str(qrel["query-id"]), str(qrel["corpus-id"])
    qrels[query_key][doc_key] = qrel["score"]

# Finish with a plain dict so looking up an unknown query id raises KeyError
# instead of silently inserting an empty entry
qrels = dict(qrels)

In [None]:
qrels

{'1': {'31715818': 1},
 '3': {'14717500': 1},
 '5': {'13734012': 1},
 '13': {'1606628': 1},
 '36': {'5152028': 1, '11705328': 1},
 '42': {'18174210': 1},
 '48': {'13734012': 1},
 '49': {'5953485': 1},
 '50': {'12580014': 1},
 '51': {'45638119': 1},
 '53': {'45638119': 1},
 '54': {'49556906': 1},
 '56': {'4709641': 1},
 '57': {'4709641': 1},
 '70': {'5956380': 1, '4414547': 1},
 '72': {'6076903': 1},
 '75': {'4387784': 1},
 '94': {'1215116': 1},
 '99': {'18810195': 1},
 '100': {'4381486': 1},
 '113': {'6157837': 1},
 '115': {'33872649': 1},
 '118': {'6372244': 1},
 '124': {'4883040': 1},
 '127': {'21598000': 1},
 '128': {'8290953': 1},
 '129': {'27768226': 1},
 '130': {'27768226': 1},
 '132': {'7975937': 1},
 '133': {'38485364': 1,
  '6969753': 1,
  '17934082': 1,
  '16280642': 1,
  '12640810': 1},
 '137': {'26016929': 1},
 '141': {'6955746': 1, '14437255': 1},
 '142': {'10582939': 1},
 '143': {'10582939': 1},
 '146': {'10582939': 1},
 '148': {'1084345': 1},
 '163': {'18872233': 1},
 '1

In [None]:
print(f"Number of query-document pairs: {len(qrels)}")

Number of query-document pairs: 300


In [None]:
queries = {query["_id"]: query["text"] for query in dataset_q["queries"]}

In [None]:
print(f"Number of queries: {len(queries)}")


Number of queries: 1109


In [None]:
corpus = {doc["_id"]: doc['text'] for doc in dataset["corpus"]}

In [None]:
print(f"Corpus size: {len(corpus)}")
print(f"Number of queries: {len(queries)}")
print(f"Number of query-document pairs: {len(qrels)}")

Corpus size: 3633
Number of queries: 3237
Number of query-document pairs: 323


In [None]:
len(corpus)

3633

# Part 1

In [None]:
# Load a seq2seq generator (FLAN-T5 by default) together with its tokenizer
def load_generative_model(model_name="google/flan-t5-large"):
    """Load the tokenizer and seq2seq model registered under ``model_name``.

    Both objects are obtained through ``from_pretrained``; the tokenizer is
    loaded first, then the model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )

In [None]:
# Segment document into chunks
def segment_document(document, chunk_size, overlap):
    """Split ``document`` into windows of whitespace-separated tokens.

    Parameters:
    - document: Text to split (tokenised with ``str.split()``).
    - chunk_size: Maximum number of tokens per chunk; must be >= 1.
    - overlap: Number of tokens shared by consecutive chunks; must satisfy
      ``0 <= overlap < chunk_size`` so the window always moves forward.

    Returns:
    - List of chunk strings (tokens re-joined with single spaces). An empty or
      whitespace-only document yields an empty list.

    Raises:
    - ValueError: if ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = document.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Once a window reaches the end of the document, any further window
        # would only repeat tokens already covered by this one.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [None]:
def generate_pseudo_queries(chunks, tokenizer, model, top_k_ques=5, max_query_length=20, sampling_top_k=5):
    """Generate ``top_k_ques`` sampled questions for every chunk.

    Parameters:
    - chunks: List of passage strings.
    - tokenizer: Tokenizer matching ``model``.
    - model: Generative model exposing ``to`` and ``generate``.
    - top_k_ques: Number of questions sampled per chunk.
    - max_query_length: ``max_length`` passed to ``generate``; questions longer
      than this are cut off.
    - sampling_top_k: ``top_k`` passed to ``generate`` (size of the candidate
      set for each sampled token).

    Returns:
    - List with one entry per chunk, each a list of ``top_k_ques`` decoded
      questions. An empty ``chunks`` list yields ``[]`` without touching the
      model.

    Note: generation uses sampling (``do_sample=True``), so results vary between
    runs unless the random state is fixed by the caller.
    """
    # Nothing to generate for; also avoids tokenizing an empty batch.
    if not chunks:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the same device (GPU or CPU)

    # Prepare the prompts
    prompts = [
        f"Generate a detailed and nuanced question focusing on the most significant aspects of the passage:\n\n{chunk}\n\n"
        for chunk in chunks
    ]

    # Tokenize the whole batch; prompts longer than 512 tokens are truncated
    inputs = tokenizer(prompts, truncation=True, padding=True, return_tensors="pt", max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate pseudo queries without gradients
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_query_length,
            num_return_sequences=top_k_ques,  # Number of queries to generate per chunk
            do_sample=True,  # Use sampling to generate diverse queries
            top_k=sampling_top_k,  # Sample only among the most probable next tokens
        )

    # Decode the generated sequences into a flat list of questions
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Regroup the flat list into one list of `top_k_ques` questions per chunk
    return [
        decoded[i:i + top_k_ques]
        for i in range(0, len(decoded), top_k_ques)
    ]


In [None]:

# Diversity filtering using semantic similarity
def filter_diverse_queries(queries, similarity_model, threshold=0.8):
    """Greedily drop queries that are near-duplicates of an already kept query.

    Queries are visited in order; a query is kept only if its cosine similarity
    to every previously *kept* query is not above ``threshold``.

    Parameters:
    - queries: List of query strings.
    - similarity_model: Object whose ``encode(queries, convert_to_tensor=True)``
      returns one embedding row per query.
    - threshold: Cosine similarity above which two queries count as redundant.

    Returns:
    - List of kept queries in their original order (``[]`` for empty input).
    """
    if not queries:
        return []

    embeddings = similarity_model.encode(queries, convert_to_tensor=True)

    # Pairwise cosine similarities, computed once for the whole batch
    normalized = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    similarity = normalized @ normalized.T

    # Track the indices of kept queries so each candidate is compared against
    # the embeddings of the queries that were actually retained.
    kept_indices = []
    for i in range(len(queries)):
        is_redundant = any(similarity[i, j].item() > threshold for j in kept_indices)
        if not is_redundant:
            kept_indices.append(i)
    return [queries[i] for i in kept_indices]


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Score one query against one document's pre-computed pseudo-query embeddings.

    Parameters:
    - user_pseudo_query: The query text (string); it is embedded here with `similarity_model`.
    - document_pseudo_queries: Tensor of pseudo-query embeddings for the document
      (already encoded, not raw strings); it is moved to the compute device.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - max_similarity: Maximum cosine similarity between the query and any of the
      document's pseudo-query embeddings, or -inf when the document has no
      embeddings at all.
    """
    # A document without pseudo-query embeddings can never match; report the
    # lowest possible score instead of letting torch.max fail on an empty tensor.
    if document_pseudo_queries.numel() == 0:
        return float("-inf")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    similarity_model_gpu = similarity_model.to(device)

    # Compute embedding for the query on the specified device
    query_embedding = similarity_model_gpu.encode(
        user_pseudo_query,
        convert_to_tensor=True,
        device=device
    )

    # A single string encodes to a 1-D vector; make it a 1-row matrix
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Ensure pseudo-query embeddings are also on the same device
    document_pseudo_queries = document_pseudo_queries.to(device)

    # Compute cosine similarity scores
    similarity_scores = util.cos_sim(query_embedding, document_pseudo_queries)

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents by the best cosine similarity between the query and any of
    their pseudo-query embeddings.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors holding that document's pseudo-query embeddings.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending scores.
      Documents whose embedding tensor is empty receive a score of -inf.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode and normalise the query once; it is the same for every document.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        if pseudo_queries_embeddings.numel() == 0:
            # No pseudo-queries for this document: it can never match.
            document_scores[doc_id] = float("-inf")
            continue
        doc_embeddings = pseudo_queries_embeddings.to(device)
        if doc_embeddings.dim() == 1:
            doc_embeddings = doc_embeddings.unsqueeze(0)
        doc_embeddings = torch.nn.functional.normalize(doc_embeddings, p=2, dim=1)
        # Dot product of unit vectors == cosine similarity; keep the best match
        document_scores[doc_id] = torch.max(query_embedding @ doc_embeddings.T).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Model names
generative_model_name = "google/flan-t5-large"  # handed to process_corpus, which loads it via load_generative_model
similarity_model_name = "all-mpnet-base-v2"  # SentenceTransformer used for embeddings / cosine similarity
# Chunking and diversity-filter settings passed to process_corpus
chunk_size=500  # whitespace-separated tokens per chunk (see segment_document)
overlap=0  # tokens shared between consecutive chunks
threshold=0.8  # cosine similarity above which a generated query is treated as redundant

In [None]:
similarity_model = SentenceTransformer(similarity_model_name)

In [None]:
# Main function to process the corpus
def process_corpus(corpus, generative_model_name, similarity_model, chunk_size, overlap, threshold):
    """Generate diversity-filtered pseudo-queries for every document in ``corpus``.

    Parameters:
    - corpus: Dictionary mapping document IDs to document text.
    - generative_model_name: Model name handed to ``load_generative_model``.
    - similarity_model: Sentence embedding model used by ``filter_diverse_queries``.
    - chunk_size, overlap: Chunking parameters forwarded to ``segment_document``.
    - threshold: Redundancy threshold forwarded to ``filter_diverse_queries``.

    Returns:
    - Dictionary mapping each document ID to the list of pseudo-queries kept for
      it, covering every chunk of the document. Documents that produce no chunks
      (e.g. empty text) map to an empty list.
    """
    tokenizer, generative_model = load_generative_model(generative_model_name)

    document_generated_queries = {}

    # Loop over each document in the corpus with tqdm progress bar
    for doc_id, document in tqdm(corpus.items(), desc="Processing Documents"):
        # Segment the document into chunks
        chunks = segment_document(document, chunk_size, overlap)

        document_queries = []
        if chunks:
            # One list of generated questions per chunk
            generated_queries = generate_pseudo_queries(chunks, tokenizer, generative_model)

            # Filter each chunk's questions and keep the survivors of every
            # chunk, not just the first one.
            for chunk_queries in generated_queries:
                document_queries.extend(
                    filter_diverse_queries(chunk_queries, similarity_model, threshold)
                )

        document_generated_queries[doc_id] = document_queries
    return document_generated_queries


# NOTE
Below is the code that calls the functions for generating the pseudo-queries.
You do not need to generate the pseudo-queries yourself. Use the following link to download the already generated pseudo-queries, which you can use to train the autoregressive model.

Link: https://drive.google.com/drive/folders/191D9QMsCVku2V1aCE0ZlkWvDqCzXlWQ3?usp=sharing

Check the files for their suffix to know which dataset they contain.

In [None]:
results = process_corpus(corpus, generative_model_name, similarity_model,chunk_size, overlap, threshold)

Processing Documents: 100%|██████████| 3633/3633 [57:48<00:00,  1.05it/s]


In [None]:
# with open("/content/drive/MyDrive/646Project/646Project/test/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus_DONOTRERUN.json", "w") as file:
#     json.dump(results, file, indent=4)
# print("Data saved as generated_queries.json")

Data saved as generated_queries.json


In [None]:
# type(results)

## PART 2

# Part 3

Read the file from the link above and load its contents into the `data` object, then use it to compute the evaluation metrics.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

file_path = "/content/drive/MyDrive/646Project/sharedfiles/document_generated_queries_withdiversequeries_flan-t5-large_scifact.json"

with open(file_path, "r") as file:
  data = json.load(file)


In [None]:
document_pseudo_queries = data

#NOTE

For now, we use the first 100 queries from qrels to test Part 3. Once the model is trained, the pseudo-queries it generates for these 100 queries should be used instead.

In [None]:
queries_dict = list(qrels.items())[:100]

In [None]:
type(queries_dict)

list

In [None]:
result = [item[0] for item in queries_dict]

In [None]:

queries_dict

[('1', {'31715818': 1}),
 ('3', {'14717500': 1}),
 ('5', {'13734012': 1}),
 ('13', {'1606628': 1}),
 ('36', {'5152028': 1, '11705328': 1}),
 ('42', {'18174210': 1}),
 ('48', {'13734012': 1}),
 ('49', {'5953485': 1}),
 ('50', {'12580014': 1}),
 ('51', {'45638119': 1}),
 ('53', {'45638119': 1}),
 ('54', {'49556906': 1}),
 ('56', {'4709641': 1}),
 ('57', {'4709641': 1}),
 ('70', {'5956380': 1, '4414547': 1}),
 ('72', {'6076903': 1}),
 ('75', {'4387784': 1}),
 ('94', {'1215116': 1}),
 ('99', {'18810195': 1}),
 ('100', {'4381486': 1}),
 ('113', {'6157837': 1}),
 ('115', {'33872649': 1}),
 ('118', {'6372244': 1}),
 ('124', {'4883040': 1}),
 ('127', {'21598000': 1}),
 ('128', {'8290953': 1}),
 ('129', {'27768226': 1}),
 ('130', {'27768226': 1}),
 ('132', {'7975937': 1}),
 ('133',
  {'38485364': 1, '6969753': 1, '17934082': 1, '16280642': 1, '12640810': 1}),
 ('137', {'26016929': 1}),
 ('141', {'6955746': 1, '14437255': 1}),
 ('142', {'10582939': 1}),
 ('143', {'10582939': 1}),
 ('146', {'1058

In [None]:
type(result)

list

In [None]:
# with open("result.tsv", "w") as file:
#     for item in result:
#         file.write(item + "\n")

In [None]:

import json
# Load the cross-encoder results from Drive. Judging by the displayed output,
# this is a dict mapping a query id to a list of pseudo-query strings.
with open("/content/drive/MyDrive/646Project/sharedfiles/scifact_cross_encoder_results.json", "r") as file:
    cross_encoder_res = json.load(file)

# Query ids present in the cross-encoder file
result_2 = list(cross_encoder_res.keys())


In [None]:
cross_encoder_res

{'871': ['Does obesity have a negative effect on mortality and causes?',
  'Do the results indicate that obesity and mortality have an important association?',
  'Is bariatric surgery associated with decreased rates of depression after surgery for obesity?',
  'Does cold induced obesity in mice increase lipid deposition and plaque development?',
  'What are the findings of this study of underweight children worldwide?',
  'What were the main outcomes of this study?',
  'What was the result of this study?',
  'What are the main conclusions?',
  'How was the prevalence of underweight predicted to change between 1990 and 2015?',
  'What are the findings of this study?',
  'What was the main result of the study?',
  'What were the findings of this study?',
  'Is higher body-mass index associated with increased risk of mortality : a prospective',
  'What are the main conclusions of this study?',
  'Which region saw the largest change in the prevalence of underweight children?',
  'What is t

In [None]:
# Sanity check: ids that appear in only one of the two id lists
# (`result` = ids taken from qrels, `result_2` = keys of the cross-encoder file).
a_minus_b = list(set(result) - set(result_2))
b_minus_a = list(set(result_2) - set(result))

# Output the results
print("A - B:", a_minus_b)
print("B - A:", b_minus_a)

A - B: ['501', '491']
B - A: ['1291', '871']


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Score one query against one document's pre-computed pseudo-query embeddings.

    NOTE: this cell re-defines the function of the same name from Part 1; being
    the later definition, it is the one in effect once the notebook has run.

    Parameters:
    - user_pseudo_query: The query text (string); it is embedded here with `similarity_model`.
    - document_pseudo_queries: Tensor of pseudo-query embeddings for the document
      (already encoded, not raw strings); it is moved to the compute device.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - max_similarity: Maximum cosine similarity between the query and any of the
      document's pseudo-query embeddings, or -inf when the document has no
      embeddings at all.
    """
    # A document without pseudo-query embeddings can never match; report the
    # lowest possible score instead of letting torch.max fail on an empty tensor.
    if document_pseudo_queries.numel() == 0:
        return float("-inf")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    similarity_model_gpu = similarity_model.to(device)

    # Compute embedding for the query on the specified device
    query_embedding = similarity_model_gpu.encode(
        user_pseudo_query,
        convert_to_tensor=True,
        device=device
    )

    # A single string encodes to a 1-D vector; make it a 1-row matrix
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Ensure pseudo-query embeddings are also on the same device
    document_pseudo_queries = document_pseudo_queries.to(device)

    # Compute cosine similarity scores
    similarity_scores = util.cos_sim(query_embedding, document_pseudo_queries)

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents by the best cosine similarity between the query and any of
    their pseudo-query embeddings.

    NOTE: this cell re-defines the function of the same name from Part 1; being
    the later definition, it is the one in effect once the notebook has run.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors holding that document's pseudo-query embeddings.
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending scores.
      Documents whose embedding tensor is empty receive a score of -inf.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode and normalise the query once; it is the same for every document.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        if pseudo_queries_embeddings.numel() == 0:
            # No pseudo-queries for this document: it can never match.
            document_scores[doc_id] = float("-inf")
            continue
        doc_embeddings = pseudo_queries_embeddings.to(device)
        if doc_embeddings.dim() == 1:
            doc_embeddings = doc_embeddings.unsqueeze(0)
        doc_embeddings = torch.nn.functional.normalize(doc_embeddings, p=2, dim=1)
        # Dot product of unit vectors == cosine similarity; keep the best match
        document_scores[doc_id] = torch.max(query_embedding @ doc_embeddings.T).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Encode every document's pseudo-queries once and cache the tensors by doc id,
# so later scoring only has to embed the incoming query.
device = "cuda" if torch.cuda.is_available() else "cpu"
similarity_model_gpu = similarity_model.to(device)

test_dict = {}
document_pseudo_queries_embeddings = {}
for doc_id, pseudo_queries in tqdm(document_pseudo_queries.items()):
  pseudo_query_embeddings = similarity_model_gpu.encode(
      pseudo_queries,
      convert_to_tensor=True,
      device=device,
  )
  document_pseudo_queries_embeddings[doc_id] = pseudo_query_embeddings



100%|██████████| 5183/5183 [01:21<00:00, 63.54it/s]


In [None]:
document_pseudo_queries

{'4983': ['Are cerebral white matter architecture differences between preterm infants at term and full-term infants',
  'Does diffusion-weighted magnetic resonance imaging in the developing brain of preterm and full-',
  'Does diffusion tensor analysis allow a quantitative measurement of water diffusion in developing brain tissue',
  'Does diffusion-weighted MRI sequence with diffusion tensor analysis delineate'],
 '5836': ['Is myeloid-derived suppressor cell expansion a result of an interaction between S',
  'What is the role of myeloid derived suppressor cells in the development of mye',
  'Does early forced maturation of myeloid-derived suppressor cells restore hematop',
  'What is the effect of S100A9/CD33 signaling on the development of my'],
 '7912': ['What are ID elements?',
  'What are the two steps involved in the retroposition process?',
  'What is the name of the gene whose transcripts are able to self-prime',
  'Which regulatory steps are involved in the amplification of SI

  # NOTE
  Replace the commented-out line in the code below so that it obtains the model-generated pseudo-query for each of the first 100 queries in qrels:

  *user_pseudo_query = generate_pseudo_query(queries[query_id])*


  Suggestion: instead of generating inside the loop, it is better to generate the pseudo-queries for all 100 queries at once and then look them up as needed.


In [None]:
# import json
# with open("/content/drive/MyDrive/646Project/sharedfiles/validation_result_nfcorpus.json", "r") as file:
#     user_pseudo_queries = json.load(file)

In [None]:
# for query_id, doc_score in tqdm(queries_dict):
#   # print(query_id)
#   user_pseudo_query = generate_pseudo_query(queries[query_id], api_key)

In [None]:
# cross_encoder_res['PLAIN-186']

In [None]:
# queries_dict

In [None]:
# queries

In [None]:
# queries["PLAIN-2"]

In [None]:
from collections import defaultdict

# Data preprocessing to create reverse mapping:
# lower-cased pseudo-query text -> list of ids of the documents it was generated for.
# The same text can come from several documents, hence a list per key.
reverse_mapping = defaultdict(list)
for doc_id, pseudo_queries in document_pseudo_queries.items():
    for pseudo_query in pseudo_queries:
        # Lower-case the key so the later lookup can be case-insensitive
        reverse_mapping[pseudo_query.lower()].append(doc_id)




In [None]:
reverse_mapping

defaultdict(list,
            {'are cerebral white matter architecture differences between preterm infants at term and full-term infants': ['4983'],
             'does diffusion-weighted magnetic resonance imaging in the developing brain of preterm and full-': ['4983'],
             'does diffusion tensor analysis allow a quantitative measurement of water diffusion in developing brain tissue': ['4983'],
             'does diffusion-weighted mri sequence with diffusion tensor analysis delineate': ['4983'],
             'is myeloid-derived suppressor cell expansion a result of an interaction between s': ['5836'],
             'what is the role of myeloid derived suppressor cells in the development of mye': ['5836'],
             'does early forced maturation of myeloid-derived suppressor cells restore hematop': ['5836'],
             'what is the effect of s100a9/cd33 signaling on the development of my': ['5836'],
             'what are id elements?': ['7912'],
             'what are the

In [None]:
# Map every query id to the documents whose pseudo-queries match its ranked
# list of pseudo-queries (order of the ranked list is preserved).
result = {}

# The loop variable is deliberately NOT named `queries`: that name holds the
# query-id -> query-text mapping built earlier and must not be overwritten.
for query_id, ranked_queries in cross_encoder_res.items():
    doc_ids = []
    for query in ranked_queries:
        needle = query.lower()  # case-insensitive match; computed once per query
        # Linear scan with substring matching: the first reverse_mapping key
        # that contains the query text wins.
        for pseudo_query, associated_doc_ids in reverse_mapping.items():
            if needle in pseudo_query:
                doc_ids.extend(associated_doc_ids)
                break  # Stop after finding the first match
    # Remove duplicates and keep the order of document IDs
    result[query_id] = list(dict.fromkeys(doc_ids))

In [None]:
result

{'871': ['195689316',
  '18872233',
  '970012',
  '1606628',
  '6173523',
  '418246',
  '10874408',
  '3752408',
  '14717500',
  '1887056',
  '5222182',
  '14079881',
  '17914395',
  '20334484',
  '25900857',
  '44624045',
  '45638119',
  '9274291',
  '16495649',
  '20326526',
  '10536636',
  '803312',
  '11172205',
  '12030318',
  '27438378',
  '40558887',
  '1191830',
  '1398021',
  '1944452',
  '2158500',
  '2810997',
  '4311206',
  '8087082',
  '8780599',
  '9539248',
  '10546779',
  '11256632',
  '11615242',
  '13097856',
  '13515165',
  '14550841',
  '18574146',
  '20045514',
  '20697217',
  '21301090',
  '23557241',
  '23594156',
  '25041967',
  '25049067',
  '26990001',
  '31543713',
  '33723822',
  '33989422',
  '36033696',
  '38664102',
  '39048693',
  '39390206',
  '39970500',
  '40935722',
  '41354899',
  '42291761',
  '44660616',
  '117907685',
  '6490571',
  '8672737',
  '13329980',
  '33872649',
  '6504953',
  '13948920',
  '21993510',
  '145383432'],
 '1291': ['56893404

In [None]:
# qrel_pseudo ={}
# for query_id, doc_score in tqdm(queries_dict):
#   # print(query_id)
#   user_pseudo_query = data[query_id]
#   qrel_pseudo[query_id] = user_pseudo_query
#   # print("Original Query:",queries[query_id])
#   # print("New Query:",user_pseudo_query)
#   # break
#   # user_pseudo_query = user_pseudo_queries[query_id]
#   if(user_pseudo_query == ""):
#     print("error")
#     break
#   doc_rank_for_query = rank_documents_by_query(user_pseudo_query, document_pseudo_queries_embeddings, similarity_model)
#   # print(type(doc_rank_for_query))
#   test_dict[query_id]={}
#   for doc_id, score in doc_rank_for_query:
#     test_dict[query_id][doc_id] = float(score)


In [None]:
# document_pseudo_queries_embeddings

In [None]:
test_dict = result

In [None]:
# top_n = 10

In [None]:
# top_documents = {}
# for query_id, doc_scores in test_dict.items():
#   # Directly take the top N documents (already sorted)
#   top_documents[query_id] = dict(list(doc_scores.items())[:top_n])

In [None]:
# top_documents

In [None]:
!pip install pytrec_eval



In [None]:
import pytrec_eval

In [None]:
test_dict

{'871': ['195689316',
  '18872233',
  '970012',
  '1606628',
  '6173523',
  '418246',
  '10874408',
  '3752408',
  '14717500',
  '1887056',
  '5222182',
  '14079881',
  '17914395',
  '20334484',
  '25900857',
  '44624045',
  '45638119',
  '9274291',
  '16495649',
  '20326526',
  '10536636',
  '803312',
  '11172205',
  '12030318',
  '27438378',
  '40558887',
  '1191830',
  '1398021',
  '1944452',
  '2158500',
  '2810997',
  '4311206',
  '8087082',
  '8780599',
  '9539248',
  '10546779',
  '11256632',
  '11615242',
  '13097856',
  '13515165',
  '14550841',
  '18574146',
  '20045514',
  '20697217',
  '21301090',
  '23557241',
  '23594156',
  '25041967',
  '25049067',
  '26990001',
  '31543713',
  '33723822',
  '33989422',
  '36033696',
  '38664102',
  '39048693',
  '39390206',
  '39970500',
  '40935722',
  '41354899',
  '42291761',
  '44660616',
  '117907685',
  '6490571',
  '8672737',
  '13329980',
  '33872649',
  '6504953',
  '13948920',
  '21993510',
  '145383432'],
 '1291': ['56893404

In [None]:
# Convert each ranked doc-id list into a {doc_id: score} mapping. The first
# document gets the highest score (len(docs)) and each later position one less,
# so ordering by score reproduces the original list order.
test_dict = {
    query_id: {doc_id: len(docs) - rank for rank, doc_id in enumerate(docs)}
    for query_id, docs in test_dict.items()
}

In [None]:
# Evaluate the run in `test_dict` against `qrels` for P, recall, nDCG and MAP at cutoffs 3, 5 and 10
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P.3,5,10', 'recall.3,5,10', 'ndcg_cut.3,5,10', 'map_cut.3,5,10'})
result = evaluator.evaluate(test_dict)  # per-query metrics, keyed by query id (overwrites the earlier `result`)
print(result)
metrics = ['P','ndcg_cut', 'recall', 'map_cut']
cutoffs = [3,5,10]
# Accumulate each "<metric>_<cutoff>" value over all evaluated queries
scores = {f'{metric}_{cutoff}': 0 for metric in metrics for cutoff in cutoffs}
for key in result:
  for metric in metrics:
    for cutoff in cutoffs:
      scores[f'{metric}_{cutoff}'] += result[key][f'{metric}_{cutoff}']
# Turn the sums into means.
# NOTE(review): the sums run over the queries in `result`, but the divisor is
# len(test_dict). If the evaluator skips run queries that have no qrels entry,
# these means are deflated -- confirm len(result) == len(test_dict).
run_length = len(test_dict)
for score in scores:
  scores[score] /= run_length

{'1': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, '3': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, '5': {'P_3': 0.3333333333333333, 'P_5': 0.2, 'P_10': 0.1, 'recall_3': 1.0, 'recall_5': 1.0, 'recall_10': 1.0, 'ndcg_cut_3': 1.0, 'ndcg_cut_5': 1.0, 'ndcg_cut_10': 1.0, 'map_cut_3': 1.0, 'map_cut_5': 1.0, 'map_cut_10': 1.0}, '13': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, '36': {'P_3': 0.6666666666666666, 'P_5': 0.4, 'P_10': 0.2, 'recall_3': 1.0, 'recall_5': 1.0, 'recall_10': 1.0, 'ndcg_cut_3': 1.0, 'ndcg_cut_5': 1.0

In [None]:
scores

{'P_3': 0.29333333333333306,
 'P_5': 0.18999999999999972,
 'P_10': 0.09699999999999985,
 'ndcg_cut_3': 0.7877694492825056,
 'ndcg_cut_5': 0.8031112971037259,
 'ndcg_cut_10': 0.808063930769914,
 'recall_3': 0.7856666666666666,
 'recall_5': 0.8255,
 'recall_10': 0.838,
 'map_cut_3': 0.7755555555555554,
 'map_cut_5': 0.7891555555555556,
 'map_cut_10': 0.792488888888889}