In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from itertools import combinations
import numpy as np
import json
from tqdm import tqdm

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Loading Dataset : NFCorpus

In [None]:
from datasets import load_dataset

In [None]:
from tqdm import tqdm

In [None]:
dataset = load_dataset("BeIR/nfcorpus", "corpus")

In [None]:
dataset

In [None]:
dataset_q = load_dataset("BeIR/nfcorpus", "queries")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

nfcorpus.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

queries/queries/0000.parquet:   0%|          | 0.00/84.8k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/3237 [00:00<?, ? examples/s]

In [None]:
dataset_q

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 3237
    })
})

In [None]:
dataset_qrel = load_dataset("BeIR/nfcorpus-qrels", split="test")

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/258k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/280k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/110575 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11385 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12334 [00:00<?, ? examples/s]

In [None]:
dataset_qrel

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 12334
})

In [None]:
dataset_qrel[0]

{'query-id': 'PLAIN-2', 'corpus-id': 'MED-2427', 'score': 2}

In [None]:
from collections import defaultdict

# Nested relevance judgments: qrels[query_id][corpus_id] -> relevance score.
qrels = defaultdict(dict)
for qrel in dataset_qrel:
    query_id, corpus_id = qrel["query-id"], qrel["corpus-id"]
    qrels[query_id][corpus_id] = qrel["score"]

# Freeze into a plain dict so that looking up an unknown query id raises
# KeyError instead of silently creating an empty entry.
qrels = dict(qrels)

In [None]:
qrels

{'PLAIN-2': {'MED-2427': 2,
  'MED-10': 2,
  'MED-2429': 2,
  'MED-2430': 2,
  'MED-2431': 2,
  'MED-14': 2,
  'MED-2432': 2,
  'MED-2428': 1,
  'MED-2440': 1,
  'MED-2434': 1,
  'MED-2435': 1,
  'MED-2436': 1,
  'MED-2437': 1,
  'MED-2438': 1,
  'MED-2439': 1,
  'MED-3597': 1,
  'MED-3598': 1,
  'MED-3599': 1,
  'MED-4556': 1,
  'MED-4559': 1,
  'MED-4560': 1,
  'MED-4828': 1,
  'MED-4829': 1,
  'MED-4830': 1},
 'PLAIN-12': {'MED-2513': 2,
  'MED-5237': 2,
  'MED-2517': 2,
  'MED-2518': 2,
  'MED-2519': 2,
  'MED-2520': 2,
  'MED-2521': 2,
  'MED-2514': 1,
  'MED-2943': 1,
  'MED-5322': 1,
  'MED-5323': 1,
  'MED-5324': 1,
  'MED-5325': 1,
  'MED-5326': 1,
  'MED-5327': 1,
  'MED-5328': 1,
  'MED-5329': 1,
  'MED-5330': 1,
  'MED-5331': 1,
  'MED-5332': 1,
  'MED-5333': 1,
  'MED-5334': 1,
  'MED-5335': 1,
  'MED-5363': 1,
  'MED-5337': 1,
  'MED-5338': 1,
  'MED-5339': 1,
  'MED-5340': 1,
  'MED-5341': 1,
  'MED-5342': 1},
 'PLAIN-23': {'MED-2644': 2,
  'MED-2646': 2,
  'MED-2651': 2

In [None]:
# qrels is keyed by query id, so its length is the number of judged queries,
# not the number of individual (query, document) pairs.
print(f"Number of queries with relevance judgments: {len(qrels)}")

Number of query-document pairs: 323


In [None]:
# Map each query id (`_id`) to its query text; the `title` feature is not used.
queries = {query["_id"]: query["text"] for query in dataset_q["queries"]}

In [None]:
print(f"Number of queries: {len(queries)}")


Number of queries: 3237


In [None]:
# Map each document id (`_id`) to its text.
corpus = {doc["_id"]: doc['text'] for doc in dataset["corpus"]}

In [None]:
print(f"Corpus size: {len(corpus)}")
print(f"Number of queries: {len(queries)}")
# qrels is keyed by query id: its length is the number of judged queries,
# while the pair count is the total number of inner (document, score) entries.
print(f"Number of queries with relevance judgments: {len(qrels)}")
print(f"Number of query-document pairs: {sum(len(docs) for docs in qrels.values())}")

Corpus size: 3633
Number of queries: 3237
Number of query-document pairs: 323


In [None]:
len(corpus)

3633

# Part 1

In [None]:
# Load the seq2seq generative model (FLAN-T5 by default) and its tokenizer
def load_generative_model(model_name="google/flan-t5-large"):
    """Load a Hugging Face seq2seq checkpoint together with its tokenizer.

    Parameters:
    - model_name: Checkpoint name passed to both `from_pretrained` calls.
      The model is loaded through `AutoModelForSeq2SeqLM`.

    Returns:
    - (tokenizer, model) tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )

In [None]:
# Segment document into chunks
def segment_document(document, chunk_size, overlap):
    """Split a document into chunks of whitespace-separated words.

    Parameters:
    - document: Text to split (string).
    - chunk_size: Maximum number of words per chunk; must be positive.
    - overlap: Number of words shared by consecutive chunks; must satisfy
      0 <= overlap < chunk_size.

    Returns:
    - List of chunk strings (words re-joined with single spaces). An empty or
      whitespace-only document yields an empty list.

    Raises:
    - ValueError: if chunk_size or overlap is out of range. Previously an
      overlap larger than chunk_size silently produced no chunks and a
      negative overlap silently skipped words.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = document.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Once a chunk reaches the end of the document, any further window
        # would only repeat words already covered by this chunk.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [None]:
def generate_pseudo_queries(chunks, tokenizer, model, top_k_ques=5):
    """Sample pseudo-queries (questions) for each document chunk.

    Parameters:
    - chunks: List of chunk strings; one prompt is built per chunk.
    - tokenizer: Tokenizer matching `model`.
    - model: Generative model exposing `generate`; it is moved to the GPU
      when one is available, otherwise to the CPU.
    - top_k_ques: Number of sequences sampled per chunk.

    Returns:
    - List with one entry per chunk, each entry being a list of `top_k_ques`
      decoded query strings. An empty `chunks` list yields an empty list.

    Note: generation uses sampling, so results are not deterministic unless
    the caller seeds torch beforehand.
    """
    # Nothing to prompt with; return before touching the tokenizer or model.
    if not chunks:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the same device (GPU or CPU)

    # Prepare the prompts
    prompts = [
        f"Generate a detailed and nuanced question focusing on the most significant aspects of the passage:\n\n{chunk}\n\n"
        for chunk in chunks
    ]

    # Tokenize the input; prompts longer than 512 tokens are truncated
    inputs = tokenizer(prompts, truncation=True, padding=True, return_tensors="pt", max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate pseudo queries without gradients
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=20,                    # Cap on the generated sequence length
            num_return_sequences=top_k_ques,  # Number of queries to generate per chunk
            do_sample=True,                   # Use sampling to generate diverse queries
            top_k=5                           # Sample only from the 5 most probable next tokens
        )

    # Decode the generated sequences into queries
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # `batch_decode` returns a flat list ordered chunk by chunk; regroup it
    # into one list of `top_k_ques` queries per chunk.
    return [
        decoded[i:i + top_k_ques]
        for i in range(0, len(decoded), top_k_ques)
    ]


In [None]:

# Diversity filtering using semantic similarity
def filter_diverse_queries(queries, similarity_model, threshold=0.8):
    """Drop queries that are near-duplicates of an already kept query.

    Queries are visited in order; a query is kept only if its cosine
    similarity to every previously *kept* query is at most `threshold`.

    Parameters:
    - queries: List of query strings.
    - similarity_model: Model whose `encode(queries, convert_to_tensor=True)`
      returns one embedding row per query.
    - threshold: Cosine similarity above which a query counts as redundant.

    Returns:
    - List of kept queries, in their original order.
    """
    if not queries:
        return []

    embeddings = similarity_model.encode(queries, convert_to_tensor=True)

    # Track positions of kept queries. The previous implementation compared
    # against embeddings[0..len(kept)-1], i.e. the first k inputs rather than
    # the k kept ones, so dropped queries were consulted and kept ones ignored.
    kept_indices = []
    for i in range(len(queries)):
        is_redundant = any(
            torch.nn.functional.cosine_similarity(embeddings[i], embeddings[j], dim=0).item() > threshold
            for j in kept_indices
        )
        if not is_redundant:
            kept_indices.append(i)
    return [queries[i] for i in kept_indices]


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Calculate the similarity score for a query-document pair.

    Parameters:
    - user_pseudo_query: The query text (string), or an already computed
      query embedding tensor. Passing a tensor skips the encoding step, which
      lets callers encode a query once and reuse it across documents.
    - document_pseudo_queries: Tensor of pre-computed pseudo-query embeddings
      for the document, one row per pseudo-query.
    - similarity_model: Preloaded Sentence Transformer model; only used when
      `user_pseudo_query` is not a tensor.

    Returns:
    - max_similarity: Maximum cosine similarity between the query and the
      document's pseudo-queries, or -inf when the document has no embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Ensure pseudo-query embeddings are on the device used for scoring
    document_embeddings = document_pseudo_queries.to(device)
    if document_embeddings.numel() == 0:
        # torch.max fails on an empty tensor; -inf ranks such a document last.
        return float("-inf")
    if document_embeddings.dim() == 1:
        document_embeddings = document_embeddings.unsqueeze(0)

    if torch.is_tensor(user_pseudo_query):
        query_embedding = user_pseudo_query.to(device)
    else:
        # Compute embedding for the query on the specified device
        query_embedding = similarity_model.to(device).encode(
            user_pseudo_query,
            convert_to_tensor=True,
            device=device
        )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Pairwise cosine similarity via broadcasting: (Q, 1, D) x (1, N, D) -> (Q, N)
    similarity_scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(1), document_embeddings.unsqueeze(0), dim=-1
    )

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is encoded once and reused for every document; encoding it
    again per document dominated the runtime of the ranking loop.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document
      IDs and values are tensors of pre-computed pseudo-query embeddings
      (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending
      scores, where score is the maximum cosine similarity between the query
      and the document's pseudo-queries (-inf for a document without
      embeddings).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Loop-invariant work: encode the query a single time.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_embedding = query_embedding.unsqueeze(1)  # (Q, 1, D) for broadcasting

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        doc_embeddings = pseudo_queries_embeddings.to(device)
        if doc_embeddings.numel() == 0:
            # No pseudo-queries to compare against: rank this document last.
            document_scores[doc_id] = float("-inf")
            continue
        if doc_embeddings.dim() == 1:
            doc_embeddings = doc_embeddings.unsqueeze(0)
        similarity_scores = torch.nn.functional.cosine_similarity(
            query_embedding, doc_embeddings.unsqueeze(0), dim=-1
        )
        document_scores[doc_id] = torch.max(similarity_scores).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_scores

In [None]:
# Model names
generative_model_name = "google/flan-t5-large"  # checkpoint used to generate pseudo-queries
similarity_model_name = "all-mpnet-base-v2"  # SentenceTransformer used for embeddings / cosine similarity
# Settings forwarded to process_corpus
chunk_size=500  # words per chunk (segment_document splits on whitespace)
overlap=0  # words shared between consecutive chunks
threshold=0.8  # cosine similarity above which a generated query is treated as redundant

In [None]:
similarity_model = SentenceTransformer(similarity_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Main function to process the corpus
def process_corpus(corpus, generative_model_name, similarity_model, chunk_size, overlap, threshold):
    """Generate diversity-filtered pseudo-queries for every document.

    Parameters:
    - corpus: Dictionary mapping document IDs to document text.
    - generative_model_name: Checkpoint name handed to load_generative_model.
    - similarity_model: Preloaded Sentence Transformer model used for the
      diversity filter.
    - chunk_size, overlap: Chunking settings forwarded to segment_document.
    - threshold: Similarity threshold forwarded to filter_diverse_queries.

    Returns:
    - Dictionary mapping each document ID to its list of pseudo-queries.
      Queries from every chunk are included (the filter is applied per
      chunk); a document that yields no chunks maps to an empty list.
    """
    tokenizer, generative_model = load_generative_model(generative_model_name)

    document_generated_queries = {}

    # Loop over each document in the corpus with tqdm progress bar
    for doc_id, document in tqdm(corpus.items(), desc="Processing Documents"):
        # Segment the document into chunks
        chunks = segment_document(document, chunk_size, overlap)

        document_queries = []
        if chunks:  # empty documents produce no chunks and therefore no queries
            generated_queries = generate_pseudo_queries(chunks, tokenizer, generative_model)
            # One list of candidates per chunk. Previously only
            # generated_queries[0] was used, discarding all later chunks.
            for chunk_candidates in generated_queries:
                document_queries.extend(
                    filter_diverse_queries(chunk_candidates, similarity_model, threshold)
                )

        document_generated_queries[doc_id] = document_queries
    return document_generated_queries


# NOTE
Below is the code that calls the functions for generating the pseudo-queries.
You do not need to generate the pseudo-queries yourself. Use the following link to find the generated pseudo-queries, which you can use to train the autoregressive model.

Link: https://drive.google.com/drive/folders/191D9QMsCVku2V1aCE0ZlkWvDqCzXlWQ3?usp=sharing

Check the files for their suffix to know which dataset they contain.

In [None]:
results = process_corpus(corpus, generative_model_name, similarity_model,chunk_size, overlap, threshold)

Processing Documents: 100%|██████████| 3633/3633 [57:48<00:00,  1.05it/s]


In [None]:
# with open("/content/drive/MyDrive/646Project/646Project/test/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus_DONOTRERUN.json", "w") as file:
#     json.dump(results, file, indent=4)
# print("Data saved as generated_queries.json")

Data saved as generated_queries.json


In [None]:
# type(results)

# Part 3

Read the file from the link above and load its contents into the `data` object, then use it to compute the evaluation metrics.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = "/content/drive/MyDrive/646Project/sharedfiles/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus.json"

# Load the pre-generated document pseudo-queries from the shared Drive folder.
with open(file_path, mode="r") as pseudo_queries_file:
    data = json.load(pseudo_queries_file)


In [None]:
document_pseudo_queries = data

# NOTE

For now, we use the first 100 queries from qrels to test Part 3. Once the model is trained, the pseudo-queries it generates for those 100 queries should be used instead.

In [None]:
# Despite its name this is a list of (query_id, {corpus_id: score}) tuples:
# the first 100 entries of qrels in insertion order.
queries_dict = list(qrels.items())[:100]

In [None]:
# type(queries_dict)

In [None]:
# result = [item[0] for item in queries_dict]

In [None]:
# queries_dict

In [None]:
# type(result_2)

In [None]:
# with open("result.tsv", "w") as file:
#     for item in result:
#         file.write(item + "\n")

In [None]:

# import json
# with open("validation_result.json", "r") as file:
#     data = json.load(file)

# result_2 = list(data.keys())


In [None]:
# a_minus_b = list(set(result) - set(result_2))
# b_minus_a = list(set(result_2) - set(result))

# # Output the results
# print("A - B:", a_minus_b)
# print("B - A:", b_minus_a)

In [None]:
# This cell re-defines calculate_similarity_score from Part 1; the definition
# executed last is the one in effect.
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Calculate the similarity score for a query-document pair.

    Parameters:
    - user_pseudo_query: The query text (string), or an already computed
      query embedding tensor. Passing a tensor skips the encoding step, which
      lets callers encode a query once and reuse it across documents.
    - document_pseudo_queries: Tensor of pre-computed pseudo-query embeddings
      for the document, one row per pseudo-query.
    - similarity_model: Preloaded Sentence Transformer model; only used when
      `user_pseudo_query` is not a tensor.

    Returns:
    - max_similarity: Maximum cosine similarity between the query and the
      document's pseudo-queries, or -inf when the document has no embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Ensure pseudo-query embeddings are on the device used for scoring
    document_embeddings = document_pseudo_queries.to(device)
    if document_embeddings.numel() == 0:
        # torch.max fails on an empty tensor; -inf ranks such a document last.
        return float("-inf")
    if document_embeddings.dim() == 1:
        document_embeddings = document_embeddings.unsqueeze(0)

    if torch.is_tensor(user_pseudo_query):
        query_embedding = user_pseudo_query.to(device)
    else:
        # Compute embedding for the query on the specified device
        query_embedding = similarity_model.to(device).encode(
            user_pseudo_query,
            convert_to_tensor=True,
            device=device
        )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Pairwise cosine similarity via broadcasting: (Q, 1, D) x (1, N, D) -> (Q, N)
    similarity_scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(1), document_embeddings.unsqueeze(0), dim=-1
    )

    # Find the maximum similarity score
    return torch.max(similarity_scores).item()



In [None]:
# This cell re-defines rank_documents_by_query from Part 1; the definition
# executed last is the one in effect.
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is encoded once and reused for every document; encoding it
    again per document dominated the runtime of the ranking loop.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document
      IDs and values are tensors of pre-computed pseudo-query embeddings
      (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending
      scores, where score is the maximum cosine similarity between the query
      and the document's pseudo-queries (-inf for a document without
      embeddings).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Loop-invariant work: encode the query a single time.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_embedding = query_embedding.unsqueeze(1)  # (Q, 1, D) for broadcasting

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        doc_embeddings = pseudo_queries_embeddings.to(device)
        if doc_embeddings.numel() == 0:
            # No pseudo-queries to compare against: rank this document last.
            document_scores[doc_id] = float("-inf")
            continue
        if doc_embeddings.dim() == 1:
            doc_embeddings = doc_embeddings.unsqueeze(0)
        similarity_scores = torch.nn.functional.cosine_similarity(
            query_embedding, doc_embeddings.unsqueeze(0), dim=-1
        )
        document_scores[doc_id] = torch.max(similarity_scores).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_scores

In [None]:
# queries_dict = list(qrels.items())[:2]
test_dict = {}
device = "cuda" if torch.cuda.is_available() else "cpu"
similarity_model_gpu = similarity_model.to(device)

# Encode each document's pseudo-queries once up front so that ranking only
# has to embed the incoming query.
document_pseudo_queries_embeddings = {
    doc_id: similarity_model_gpu.encode(pseudo_queries, convert_to_tensor=True, device = device)
    for doc_id, pseudo_queries in tqdm(document_pseudo_queries.items())
}



100%|██████████| 3633/3633 [01:05<00:00, 55.62it/s]


  # NOTE
  Replace the commented line in the code below to get the pseudo-query generated by the model for each of the first 100 queries in qrels.

  *user_pseudo_query = generate_pseudo_query(queries[query_id])*


  Suggestion: Instead of generating inside the loop, it would be better to generate the pseudo-queries for all 100 queries at once and then look them up as needed.


In [None]:
import json

# Pseudo-queries generated for the user queries, keyed by query id.
gpt2_mapping_path = "/content/drive/MyDrive/646Project/sharedfiles/gpt2_id_response_mapping_nfcorpus.json"
with open(gpt2_mapping_path, mode="r") as mapping_file:
    user_pseudo_queries = json.load(mapping_file)

In [None]:
user_pseudo_queries

{'PLAIN-2': 'Do Cholesterol Statin Drugs Cause Breast Cancer?  What is the main idea of this article, and what are some examples from other areas that contribute to its success as an effective prevention tool ?\n The authors discuss how cholesterol can be prevented in humans by lowering body fat level. In particular they focus on atherosclerosis , which results when more than one person has abnormal levels of LDL or low HDL . Does high dietary intake increase risk factors associated with obesity among men but not women who have elevated triglycerides (T)? How does adiposity influence disease progression after coronary heart surgery compared to controls during follow up time : A systematic review \xa0 Study comparing cardiovascular diseases found higher prevalence rates across countries at baseline - Finland vs Norway & Japan More evidence was available regarding diet quality over years 8 th through 11 d Dietary intervention could reduce mortality incidence rate Across Countries Weight 

In [None]:
# Start from an empty run so re-executing this cell never mixes in stale rankings.
test_dict = {}
for query_id, _judged_docs in tqdm(queries_dict):
  # user_pseudo_query = generate_pseudo_query(queries[query_id])
  user_pseudo_query = user_pseudo_queries.get(query_id)
  if not user_pseudo_query:
    # Fail loudly: stopping with a bare "error" print left a truncated run
    # that the later cells would still evaluate as if it were complete.
    raise ValueError(f"No generated pseudo-query available for query id {query_id!r}")
  doc_rank_for_query = rank_documents_by_query(user_pseudo_query, document_pseudo_queries_embeddings, similarity_model)
  # Ranked (doc_id, score) pairs -> dict; insertion order keeps the ranking.
  test_dict[query_id] = {doc_id: float(score) for doc_id, score in doc_rank_for_query}


100%|██████████| 100/100 [2:09:28<00:00, 77.68s/it]


In [None]:
test_dict

{'PLAIN-2': {'MED-1565': 0.707399845123291,
  'MED-1885': 0.7018277049064636,
  'MED-1564': 0.6989020705223083,
  'MED-1721': 0.683612585067749,
  'MED-2440': 0.6627028584480286,
  'MED-2423': 0.6595010161399841,
  'MED-10': 0.6580274105072021,
  'MED-3862': 0.6540544033050537,
  'MED-2429': 0.6504778861999512,
  'MED-5111': 0.6469268202781677,
  'MED-5341': 0.6467967629432678,
  'MED-2428': 0.6349299550056458,
  'MED-5352': 0.6301500797271729,
  'MED-2716': 0.6286353468894958,
  'MED-4069': 0.628533661365509,
  'MED-3697': 0.6265231966972351,
  'MED-5293': 0.6228154897689819,
  'MED-2305': 0.6214445233345032,
  'MED-2162': 0.620708703994751,
  'MED-1151': 0.6190222501754761,
  'MED-1884': 0.6171309947967529,
  'MED-3833': 0.6170613765716553,
  'MED-2431': 0.6160262227058411,
  'MED-2228': 0.6157392263412476,
  'MED-3699': 0.613813579082489,
  'MED-3799': 0.6125792860984802,
  'MED-1529': 0.6047584414482117,
  'MED-3723': 0.6010489463806152,
  'MED-5196': 0.5977063775062561,
  'MED-475

In [None]:
top_n = 10

In [None]:
# Keep the first top_n documents per query; each inner dict of test_dict was
# filled in descending score order, so a plain slice is the top-N cut.
top_documents = {
  query_id: dict(list(doc_scores.items())[:top_n])
  for query_id, doc_scores in test_dict.items()
}

In [None]:
top_documents

{'PLAIN-2': {'MED-1565': 0.707399845123291,
  'MED-1885': 0.7018277049064636,
  'MED-1564': 0.6989020705223083,
  'MED-1721': 0.683612585067749,
  'MED-2440': 0.6627028584480286,
  'MED-2423': 0.6595010161399841,
  'MED-10': 0.6580274105072021,
  'MED-3862': 0.6540544033050537,
  'MED-2429': 0.6504778861999512,
  'MED-5111': 0.6469268202781677},
 'PLAIN-12': {'MED-2311': 0.6688379645347595,
  'MED-4915': 0.6640568971633911,
  'MED-1530': 0.6621805429458618,
  'MED-3558': 0.6571531295776367,
  'MED-1375': 0.6561846733093262,
  'MED-2577': 0.6552624106407166,
  'MED-1106': 0.6504529118537903,
  'MED-2402': 0.6427600979804993,
  'MED-5027': 0.6316820979118347,
  'MED-3541': 0.626696765422821},
 'PLAIN-23': {'MED-5035': 0.7078609466552734,
  'MED-4983': 0.6993905305862427,
  'MED-1106': 0.689231276512146,
  'MED-1530': 0.6872288584709167,
  'MED-3723': 0.6820054650306702,
  'MED-2439': 0.6716691255569458,
  'MED-1103': 0.6706472635269165,
  'MED-5034': 0.6700124144554138,
  'MED-4915': 0.6

In [None]:
queries['PLAIN-1050']

'Dr. Dean Ornish'

In [None]:
user_pseudo_queries['PLAIN-1050']

"Dr. Dean Ornish  What are the most common causes of food allergies? Are there any other factors that can contribute to this problem? Does dietary intake have an effect on allergic diseases, especially those related to animal proteins and fats? How does exposure affect risk in relation with meat consumption? Is diet rich or poor source environmental exposures associated more than others as compared among US adults who consume poultry workers ? Do organic sources increase mortality rates by 25% from disease , obesity , diabetes and cardiovascular events - do they reduce adherence & medical intervention (especially when children)? Who is at increased Risk : The author's questionnaire\n\xa0has been used extensively across all health care industries worldwide over several years but has not yet become standardised . In what areas was it developed based upon previous research done using data collected during epidemiological studies conducted between 2001 and 2010 only 4 countries were includ

In [None]:
user_pseudo_queries['PLAIN-12']

'Exploiting Autophagy to Live Longer  What are the two major metabolites of cruciferous vegetables? Are they associated with higher mortality risk and cardiovascular disease in humans versus animals ? Is an increase in death due from cancer or stroke related to reduced intake of tomato/chai foods , especially on vegetarians compared also vegetarian subjects, but not poultry workers who have been exposed to contaminated meat products : A review article by Arthard et al.? How is consumption influenced during life cycle changes affecting human health & well being (ABI)? Does diet change influence incidence of breast cancers among men vs women - Dietary patterns that may affect lifestyle factors as measured through food questionnaire. Which chemicals contribute to plasma concentrations above what was normal at baseline after eating more than half daily vegetable sources combined?, The results show dietary intakes vary between people living near different countries based on population chara

In [None]:
queries['PLAIN-12']

'Exploiting Autophagy to Live Longer'

In [None]:
queries['PLAIN-91']

'Chronic Headaches and Pork Parasites'

In [None]:
user_pseudo_queries['PLAIN-91']

"Chronic Headaches and Pork Parasites  What is the relationship between inflammation, oxidative stress resistance or age of onset in adults? Are there differences among individuals with higher body mass index (BMI) vs. lower BMI groups on different risk factors that contribute to cardiovascular disease prevention worldwide? Is dietary intake associated more consistently across all life stages compared to nonpoverty levels versus rural population living conditions ? Does consumption differ by ethnicity as well as health status , do they show similar prevalence rates at baseline years after birth : EPIC survey 2001 & 2003 ; NHANES 2004/2004; US Census 1999a et al., 2000b)? Do vegetarians have better mortality from heart diseases than other ethnicities but not hypertension related deaths due to alcohol use during pregnancy accordingto Zizek's study.? How does vegetarian diet affect cancer incidence over time based on data collected since 1991 - 2010 comparing meat eaters who were previous

In [None]:
top_documents['PLAIN-1050']

{'MED-2649': 0.7445867657661438,
 'MED-2646': 0.6818550229072571,
 'MED-2350': 0.6510166525840759,
 'MED-900': 0.6473385095596313,
 'MED-4550': 0.6409481763839722,
 'MED-3702': 0.6340570449829102,
 'MED-4433': 0.6288087964057922,
 'MED-2382': 0.6253468990325928,
 'MED-2369': 0.6252251267433167,
 'MED-3790': 0.619795024394989}

In [None]:
qrels['PLAIN-1050']

{'MED-2763': 1,
 'MED-3780': 1,
 'MED-3784': 1,
 'MED-3242': 1,
 'MED-3244': 1,
 'MED-3790': 1,
 'MED-3245': 1,
 'MED-5337': 1,
 'MED-3250': 1,
 'MED-3251': 1,
 'MED-3247': 1,
 'MED-3248': 1,
 'MED-3249': 1,
 'MED-3254': 1,
 'MED-3253': 1,
 'MED-1399': 1,
 'MED-1393': 1,
 'MED-1394': 1,
 'MED-1395': 1,
 'MED-1489': 1,
 'MED-1397': 1,
 'MED-1398': 1,
 'MED-1400': 1,
 'MED-1476': 1,
 'MED-2432': 1,
 'MED-1478': 1,
 'MED-1479': 1,
 'MED-1486': 1,
 'MED-1487': 1,
 'MED-1488': 1,
 'MED-1490': 1,
 'MED-1914': 1,
 'MED-1915': 1,
 'MED-1916': 1,
 'MED-1917': 1,
 'MED-1918': 1,
 'MED-1919': 1,
 'MED-1920': 1,
 'MED-1921': 1,
 'MED-1922': 1,
 'MED-1923': 1,
 'MED-1924': 1,
 'MED-4877': 1,
 'MED-1926': 1,
 'MED-4878': 1,
 'MED-1928': 1,
 'MED-1929': 1,
 'MED-1930': 1,
 'MED-1931': 1,
 'MED-1932': 1,
 'MED-1933': 1,
 'MED-1934': 1,
 'MED-1935': 1,
 'MED-1936': 1,
 'MED-2109': 1,
 'MED-2110': 1,
 'MED-2111': 1,
 'MED-2112': 1,
 'MED-4255': 1,
 'MED-3113': 1,
 'MED-4247': 1,
 'MED-3786': 1,
 'MED-37

In [None]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308203 sha256=0f694825c963ab278e181f815b20fe8d3bc663d856f52879aa7eb2ed2ae19b35
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [None]:
import pytrec_eval

In [None]:
# Per-query evaluation of the top-N run against the relevance judgments.
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P.3,5,10', 'recall.3,5,10', 'ndcg_cut.3,5,10', 'map_cut.3,5,10'})
result = evaluator.evaluate(top_documents)  # {query_id: {measure_name: value}}; inspect `result` directly if needed

metrics = ['P','ndcg_cut', 'recall', 'map_cut']
cutoffs = [3,5,10]
measure_names = [f'{metric}_{cutoff}' for metric in metrics for cutoff in cutoffs]

# Average over the queries that were actually evaluated, so the denominator
# always matches the number of summed terms.
run_length = len(result)
if run_length == 0:
  raise ValueError("No queries were evaluated: the run shares no query ids with qrels.")

scores = {
  name: sum(per_query[name] for per_query in result.values()) / run_length
  for name in measure_names
}

{'PLAIN-2': {'P_3': 0.0, 'P_5': 0.2, 'P_10': 0.3, 'recall_3': 0.0, 'recall_5': 0.041666666666666664, 'recall_10': 0.125, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.06560253875617089, 'ndcg_cut_10': 0.20235501202161513, 'map_cut_3': 0.0, 'map_cut_5': 0.008333333333333333, 'map_cut_10': 0.034126984126984124}, 'PLAIN-12': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, 'PLAIN-23': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, 'PLAIN-33': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, 'PLAIN-44': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, '

In [None]:
scores

{'P_3': 0.07333333333333332,
 'P_5': 0.07200000000000002,
 'P_10': 0.06299999999999999,
 'ndcg_cut_3': 0.06509270624266404,
 'ndcg_cut_5': 0.06781480804965985,
 'ndcg_cut_10': 0.06947817861800881,
 'recall_3': 0.008243905007150757,
 'recall_5': 0.019261286940412185,
 'recall_10': 0.030525413933641075,
 'map_cut_3': 0.005779647248863783,
 'map_cut_5': 0.008848119404315392,
 'map_cut_10': 0.011960283548766291}