In [None]:
# load dataset
%pip install ir_datasets
import ir_datasets
dataset = ir_datasets.load("cord19/trec-covid")
print(f"queries: {dataset.queries_count()}, docs: {dataset.docs_count()}, qrels: {dataset.qrels_count()}")

queries: 50, docs: 192509, qrels: 69318


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# convert the collection in a dataframe
queries = pd.DataFrame(dataset.queries_iter())
docs = pd.DataFrame(dataset.docs_iter())
qrels = pd.DataFrame(dataset.qrels_iter())

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Preprocess function
def preprocess(text):
    # Convert to string if not already
    text = str(text)
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    return ' '.join(tokens)

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

queries_data = queries
docs_data = docs[:1000]

display(queries_data.sample(5))
display(docs_data.sample(5))


# Apply preprocessing to text columns
queries['title'] = queries_data['title'].apply(preprocess)
queries['description'] = queries_data['description'].apply(preprocess)
docs['title'] = docs_data['title'].apply(preprocess)
docs['abstract'] = docs_data['abstract'].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,query_id,title,description,narrative
25,26,coronavirus early symptoms,what are the initial symptoms of Covid-19?,Studies of patients and the first clinical man...
45,46,dexamethasone coronavirus,what evidence is there for dexamethasone as a ...,Looking for studies on the impact of dexametha...
48,49,post-infection COVID-19 immunity,do individuals who recover from COVID-19 show ...,There is concern about re-infection for COVID-...
37,38,COVID inflammatory response,What is the mechanism of inflammatory response...,Looking for a range of studies which describes...
10,11,coronavirus hospital rationing,what are the guidelines for triaging patients ...,Seeking information on any guidelines for prio...


Unnamed: 0,doc_id,title,doi,date,abstract
515,4prvgmvt,Nursing heroism in the 21(st )Century',10.1186/1472-6955-10-4,2011-02-16,BACKGROUND: The Vivian Bullwinkel Oration hono...
725,bbjmcdo5,True versus False Parasite Interactions: A Rob...,10.1371/journal.pone.0029618,2012-01-03,BACKGROUND: Multiple infections are common in ...
76,cxzlmfst,Automated identification of multiple micro-org...,10.1093/nar/gkl565,2006-09-29,There is an increasing recognition that detail...
522,0gt21051,Autonomous Targeting of Infectious Superspread...,10.1371/journal.pcbi.1002015,2011-03-17,"Infectious disease treatments, both pharmaceut..."
329,imxe4jeo,In Vitro Viability and Cytotoxicity Testing an...,10.2174/1875397300903010033,2009-06-11,In vitro cytotoxicity testing has become an in...


In [None]:
# Replace NaN values with an empty string in 'abstract'
docs['abstract'] = docs['abstract'].fillna('')

docs['title'] = docs['title'].fillna('')
# TF-IDF for key-phrase extraction
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(docs['title'])

# Get feature names (key-phrases)
feature_names = vectorizer.get_feature_names_out()

# Example: Print top 10 key-phrases for the first document
print(feature_names)

['15' '1918' '1999' '2001' '2006' '2007' '2008' '2009' '2010' '2011' '21'
 'abstracts' 'ace2' 'acid' 'acquired' 'across' 'activated' 'activation'
 'active' 'activity' 'acute' 'adaptive' 'adenovirus' 'adult' 'adults'
 'advances' 'affecting' 'africa' 'age' 'agent' 'aids' 'air' 'airway'
 'allograft' 'alternative' 'alveolar' 'america' 'amino' 'among'
 'amplification' 'analysis' 'and' 'anemia' 'angiotensin' 'anti'
 'antibodies' 'antibody' 'antigen' 'antigens' 'antisense' 'antiviral'
 'application' 'applications' 'approach' 'approaches' 'ards' 'array'
 'asia' 'assay' 'assessing' 'assessment' 'associated' 'association'
 'attenuated' 'attitudes' 'australia' 'australian' 'autoimmune'
 'autophagy' 'avian' 'bacterial' 'based' 'beijing' 'binding' 'biological'
 'biology' 'bird' 'blood' 'bocavirus' 'bocaviruses' 'body' 'bone' 'borne'
 'bovine' 'cancer' 'capacity' 'cardiac' 'cardiovascular' 'care' 'case'
 'cases' 'cationic' 'caused' 'cd8' 'ceacam1' 'cell' 'cells' 'cellular'
 'center' 'centre' 'centur

In [None]:
# Use the TF-IDF vectorizer from the first block
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(docs['title'])

# Get feature names (key-phrases)
feature_names = vectorizer.get_feature_names_out()

# Combine key phrases for each document
key_phrases_per_document = [set() for _ in range(len(docs))]
for i, row in enumerate(tfidf_matrix):
    # Get indices of non-zero elements (indicating presence of key phrases)
    indices = row.nonzero()[1]
    # Add corresponding key phrases to the set for the document
    key_phrases_per_document[i].update(feature_names[idx] for idx in indices)

# Create a dictionary to map query IDs to their respective key phrases
query_key_phrases = {query['query_id']: set(preprocess(query['title']).split()) for _, query in queries_data.iterrows()}

# Annotate queries based on the presence of key phrases
annotations = []
for query_id, key_phrases_query in query_key_phrases.items():
    # Check if any key phrase from the query is present in the document key phrases
    is_relevant = any(key_phrases_query.intersection(key_phrases_doc) for key_phrases_doc in key_phrases_per_document)
    annotations.append(1 if is_relevant else 0)

# Add the annotations to the DataFrame
queries_data["Annotations"] = annotations

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Compute TF-IDF matrix for queries
query_tfidf_matrix = vectorizer.transform(queries['title'])

# Compute cosine similarity between queries and documents
cosine_similarities = linear_kernel(query_tfidf_matrix, tfidf_matrix)

# Choose the number of top documents to keep
top_n = 1000  # Adjust this as needed

# Create a DataFrame to store the ranked list
ranked_list = pd.DataFrame(index=queries['query_id'], columns=range(top_n))

# Fill the DataFrame with top N document indices based on cosine similarity scores
for i, query_id in enumerate(queries['query_id']):
    # Get the indices of documents sorted by similarity score
    sorted_indices = np.argsort(cosine_similarities[i])[::-1][:top_n]
    # Assign the sorted indices to the corresponding row in the DataFrame
    ranked_list.loc[query_id] = sorted_indices

# Convert the values to numeric (since they might be stored as object)
ranked_list = ranked_list.apply(pd.to_numeric, errors='coerce')

# Display the ranked list
display(ranked_list.head())


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,765,310,14,571,489,192508,64175,64174,64173,64172,...,63216,63214,63261,63180,63189,63188,63187,63186,63185,63184
2,598,855,335,791,347,443,719,4,199,283,...,63225,63245,63246,63247,63248,63267,63266,63265,63264,63263
3,834,608,306,512,343,206,672,484,827,192,...,63248,63247,63246,63245,63244,63243,63218,63216,63263,63182
4,192508,64165,64174,64173,64172,64171,64170,64169,64168,64167,...,63186,63185,63184,63183,63182,63181,63180,63179,63177,63211
5,311,182,975,286,737,213,64168,64175,64174,64173,...,63241,63216,63214,63261,63180,63189,63188,63187,63186,63185


In [None]:
#queries_data = queries_data.drop(columns=['title'])

# Convert the indices in ranked_list to integers
ranked_list.index = ranked_list.index.astype(int)

# Filter out indices that are not present in queries_data
valid_indices = ranked_list.index.intersection(queries_data.index)

# Sort queries_data based on the filtered ranked list
sorted_queries_data = queries_data.loc[valid_indices]

# Save the sorted DataFrame to a CSV file
sorted_queries_data.to_csv("sorted_data.csv", sep=',', index=False, encoding='utf-8')

# Display the sorted DataFrame
display(sorted_queries_data)

Unnamed: 0,query_id,title,description,narrative,Annotations
1,2,coronavirus response weather changes,coronavirus respond changes weather,seeking range of information about the SARS-Co...,1
2,3,coronavirus immunity,sars-cov2 infected people develop immunity cro...,seeking studies of immunity developed due to i...,1
3,4,people die coronavirus,causes death covid-19,Studies looking at mechanisms of death from Co...,0
4,5,animal models covid-19,drugs active sars-cov sars-cov-2 animal studies,Papers that describe the results of testing d...,1
5,6,coronavirus test rapid testing,types rapid testing covid-19 developed,Looking for studies identifying ways to diagno...,1
6,7,serological tests coronavirus,serological tests detect antibodies coronavirus,Looking for assays that measure immune respons...,0
7,8,coronavirus reporting,lack testing availability led underreporting t...,Looking for studies answering questions of imp...,0
8,9,coronavirus canada,covid-19 affected canada,"seeking data related to infections (confirm, s...",0
9,10,coronavirus social distancing impact,social distancing impact slowing spread covid-19,seeking specific information on studies that h...,1
10,11,coronavirus hospital rationing,guidelines triaging patients infected coronavirus,Seeking information on any guidelines for prio...,1


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

threshold = 20  # Example threshold, you can experiment with different values

# Create a list to store binary predicted labels for each query
predicted_labels = []

# Iterate over the ranked list for each query
for query_id in ranked_list.index:
    # Get the indices of the top documents based on the threshold
    top_documents_indices = ranked_list.loc[query_id][:threshold].tolist()

    # Create a binary label for each document based on whether it's in the top documents or not
    binary_labels = [1 if i in top_documents_indices else 0 for i in range(len(docs))]

    # Append the binary labels for the current query to the overall list
    predicted_labels.extend(binary_labels)

# Convert the list to a NumPy array for further calculations
predicted_labels = predicted_labels[:50]
predicted_labels = np.array(predicted_labels)

# Assuming 'annotations' is the ground truth and 'predicted_labels' is your predicted labels
precision = precision_score(annotations, predicted_labels, zero_division=0)
recall = recall_score(annotations, predicted_labels)
f1 = f1_score(annotations, predicted_labels)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Precision: 1.0000
Recall: 0.0323
F1 Score: 0.0625
