In [None]:
import os
import json
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import torch
from transformers import BertTokenizer, BertModel
from nltk import word_tokenize 

In [None]:
def removeStopwords(x):
    """Strip English stop words out of a piece of text.

    The text is split into tokens with NLTK's ``word_tokenize``; every token
    found in the module-level ``stop_words`` list is discarded and the
    remaining tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(x):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def preprocess(data):
    """Normalise the ``selected_features`` text column of ``data`` in place.

    Steps, applied in order to every cell of ``data['selected_features']``:

    1. missing values become empty strings;
    2. text is lower-cased and runs of whitespace collapse to one space;
    3. digit runs are removed;
    4. every character that is neither a word character nor whitespace
       is removed;
    5. characters in the range ``︰``-``＠`` are removed;
    6. stop words are removed via ``removeStopwords``.

    The frame is modified in place and nothing is returned.
    """
    text = data['selected_features'].fillna('')
    text = text.apply(lambda doc: " ".join(tok.lower() for tok in doc.split()))
    # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would leave the text as is.
    text = text.str.replace(r"\d+", "", regex=True)
    text = text.str.replace(r"[^\w\s]", "", regex=True)
    text = text.str.replace(r"[︰-＠]", "", regex=True)
    data['selected_features'] = text.apply(removeStopwords)

The function get_word_encodings uses the pretrained BERT model SciBERT, which is trained on scientific text including PMC data. For each input text it returns one embedding vector of length 768 (the representation of the first, [CLS], token). In the interest of time and resources, SciBERT has not been fine-tuned, but it could be fine-tuned on the corpus formed from our dataset. Also, due to resource limitations, the encodings are computed for each record separately in a loop; with more memory, batches could be used instead.

In [None]:
def get_word_encodings(x):
    """Encode each text in ``x`` with SciBERT and return the first-token vectors.

    Parameters
    ----------
    x : iterable of str
        Texts to encode; they are processed one at a time.

    Returns
    -------
    numpy.ndarray
        One row per input text, holding the final-layer hidden state of the
        first token (768 values for this BERT-base checkpoint).

    Notes
    -----
    Inputs longer than the model's position limit are truncated; without
    truncation the forward pass fails on long abstracts. Inference runs
    under ``torch.no_grad()`` so no autograd graph is kept per record.
    """
    model_version = 'allenai/scibert_scivocab_uncased'
    do_lower_case = True
    model = BertModel.from_pretrained(model_version)
    tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)
    model.eval()  # inference only: make sure dropout is disabled
    # Longest sequence the position embeddings can represent (512 for BERT).
    max_length = model.config.max_position_embeddings

    features = []
    with torch.no_grad():
        for text in x:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            outputs = model(**inputs)
            # outputs[0] is the last hidden state; position 0 is the first token.
            feature = outputs[0][:, 0, :].cpu().numpy().squeeze()
            features.append(feature)
    return np.array(features)

The following cell loads data from metadata.csv. In the interest of time and resources, only the title and abstract of each document are considered. The full text in the data files could also be appended to these two features in future experiments. Here, in doc_embeddings, we store the encodings of 100 records; due to processing time, the complete data set has not been used so far. (Note: the code below does not itself limit the number of records, so restrict the input to 100 rows to reproduce this.)

In [None]:
# Optional cap on the number of metadata rows that are loaded and encoded.
# None reads every row; because encoding happens one record at a time, a
# small value (e.g. 100) keeps an experimental run short.
MAX_DOCS = None

df_docs = pd.read_csv(
    '/kaggle/input/trec-covid-information-retrieval/CORD-19/CORD-19/metadata.csv',
    nrows=MAX_DOCS,
)
# Missing titles/abstracts become empty strings so the concatenation below
# never produces NaN.
df_docs['title'] = df_docs['title'].fillna('')
df_docs['abstract'] = df_docs['abstract'].fillna('')
# One text field per document: title followed by abstract.
df_title_abstract = (df_docs['title'] + " " + df_docs['abstract']).to_frame('selected_features')
preprocess(df_title_abstract)
x = df_title_abstract['selected_features'].to_list()
doc_embeddings = get_word_encodings(x)

The following cell stores the embeddings of query + narrative for all topics. As a further experiment, the question could also be concatenated to these.

In [None]:
# Load the round-3 topics and encode "query + narrative" for each of them.
df_query = pd.read_csv('/kaggle/input/trec-covid-information-retrieval/topics-rnd3.csv')
df_query_narrative = (
    df_query['query'] + " " + df_query['narrative']
).to_frame('selected_features')
preprocess(df_query_narrative)  # cleans the text column in place
x = list(df_query_narrative['selected_features'])
query_embedding = get_word_encodings(x)

Finally, the dot product is used as an approximation of cosine similarity, since the encoded vectors lie in a similar range. The index of the resulting data frame is the document id and the column names are the topic ids.

In [None]:
# Inner product of every document embedding with every topic embedding:
# one row per document, one column per topic.
similarity_matrix = doc_embeddings @ query_embedding.T
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=df_docs['cord_uid'].values,
    columns=df_query['topic-id'].values,
)

The similarity scores for each topic are sorted in descending order, and the top 1000 records are stored with their doc ids and topic ids in submission.csv.

In [None]:
# Number of top-ranked documents kept for each topic.
TOP_K = 1000

# Gather the rows in a plain list and build the frame once at the end;
# inserting rows one at a time with .loc gets slower as the frame grows.
records = []
for column in similarity_df.columns:
    # Sorting only this topic's scores yields the same ranking as sorting
    # the whole frame by the column, without reordering every other column.
    ranked_ids = similarity_df[column].sort_values(ascending=False).head(TOP_K).index
    records.extend({'topic_id': column, 'cord_id': cordID} for cordID in ranked_ids)

information_retrieved_df = pd.DataFrame(records, columns=['topic_id', 'cord_id'])
information_retrieved_df.to_csv('submission.csv', index=False)