In [None]:
import numpy as np
import pandas as pd
import os
import spacy

In [None]:
# Walk the Kaggle input directory and print the full path of every file found below it.
input_walk = os.walk('/kaggle/input')
for folder, _, names_in_folder in input_walk:
    for name in names_in_folder:
        print(os.path.join(folder, name))

In [None]:
# Load the round-3 topics, the CORD-19 metadata table and the round-3 document id list.
topics = pd.read_csv('../input/trec-covid-information-retrieval/topics-rnd3.csv')
documents_metadata = pd.read_csv('../input/trec-covid-information-retrieval/CORD-19/CORD-19/metadata.csv')
# The column name used for this table further down ('000ajevz') looks like a document id,
# i.e. the first line of the file appears to be data rather than a header (TODO confirm).
# Reading with header=None keeps that first id as a regular row; the column is still
# named '000ajevz' so the merges later in the notebook keep working unchanged.
docids = pd.read_csv(
    '../input/trec-covid-information-retrieval/docids-rnd3.txt',
    header=None,
    names=['000ajevz'],
)

In [None]:
import string
import en_core_web_sm
# Load the spaCy `en_core_web_sm` pipeline once; `clear_df` below calls it as `nlp(...)`.
nlp = en_core_web_sm.load()

In [None]:
def clear_df(text):
    """Normalise every string in a pandas Series and return the cleaned Series.

    Each element goes through three stages:

    1. Characters that are punctuation (``string.punctuation``) or numeric are removed.
    2. The result is tokenised with ``nlp``; stop words and tokens of three characters
       or fewer are dropped, and the remaining tokens are lower-cased and joined
       with single spaces.
    3. That string is run through ``nlp`` again and each token is replaced by its
       lemma, again joined with single spaces.

    Parameters
    ----------
    text : pandas.Series
        Series whose elements are strings.

    Returns
    -------
    pandas.Series
        Series of cleaned, space-separated lemma strings.
    """

    def _normalise(raw):
        # Stage 1: character-level filter (iterating a string yields single characters).
        stripped = ''.join(
            char for char in raw
            if not (char in string.punctuation or char.isnumeric())
        )
        # Stage 2: token-level filter on the first spaCy pass.
        kept = ' '.join(
            tok.text.lower()
            for tok in nlp(stripped)
            if not tok.is_stop and len(tok.text) > 3
        )
        # Stage 3: lemmatise the filtered text in a second spaCy pass.
        return ' '.join(tok.lemma_ for tok in nlp(kept))

    return text.apply(_normalise)

In [None]:
# Replace the raw topic queries with their cleaned form produced by `clear_df`.
# NOTE(review): this overwrites the original column, so the raw queries are no longer
# available afterwards and re-running the cell cleans already-cleaned text.
topics['query'] = clear_df(topics['query'])

In [None]:
# Load qrels.csv; its 'cord-id' column is merged against the document id list below.
# NOTE(review): presumably these are the existing relevance judgements — confirm.
qrels = pd.read_csv('../input/trec-covid-information-retrieval/qrels.csv')

In [None]:
# One row per metadata record: the document id plus title and abstract as a single string.
# A space is inserted between title and abstract so the last word of the title does not
# fuse with the first word of the abstract. A missing title or abstract still yields NaN.
texts_for_search = pd.DataFrame({
    'cord_uid': documents_metadata['cord_uid'],
    'texts': documents_metadata['title'] + ' ' + documents_metadata['abstract'],
})

In [None]:
# Outer-join qrels with the document id list; the `_merge` indicator column records
# which side each row came from.
valid_documents = qrels.merge(
    docids,
    how="outer",
    left_on='cord-id',
    right_on='000ajevz',
    indicator=True,
)
# Keep the ids that occur only in the document id list, i.e. have no row in qrels.
only_in_docids = valid_documents['_merge'] == 'right_only'
valid_documents_ids = valid_documents.loc[only_in_docids, '000ajevz']

In [None]:
# Discard rows whose combined text is missing.
texts_for_search = texts_for_search.dropna(subset=['texts'])

In [None]:
# Right-join the selected ids onto the metadata table: every metadata row is kept and
# `_merge` is 'right_only' for rows whose cord_uid is not among the selected ids.
valid_documents_data = pd.merge(
    valid_documents_ids,
    documents_metadata,
    how='right',
    left_on='000ajevz',
    right_on='cord_uid',
    indicator=True,
)
# Preview the metadata rows that did match a selected id.
matched_rows = valid_documents_data['_merge'] != 'right_only'
valid_documents_data.loc[matched_rows]

In [None]:
# Keep only the metadata rows that matched one of the selected document ids.
has_selected_id = valid_documents_data['_merge'] != 'right_only'
valid_documents_data_clear = valid_documents_data.loc[has_selected_id]

In [None]:
# Keep only documents whose title mentions one of these terms. The terms are joined
# into a single regular-expression alternation; rows with a missing title are excluded
# (na=False).
# NOTE(review): the match is case-sensitive, so a title starting with "Coronavirus"
# is not selected by the 'coronavirus' term — confirm whether that is intended.
searchfor = ['SARS-CoV-2', 'coronavirus', 'COVID', 'COVID-19']
title_matches = valid_documents_data_clear.title.str.contains('|'.join(searchfor), na=False)
# Take an explicit copy: later cells add and drop columns on this frame, and doing that
# on a boolean-mask slice of another frame triggers pandas' SettingWithCopyWarning.
valid_documents_data_clear_filtered = valid_documents_data_clear[title_matches].copy()

In [None]:
# Remove the metadata columns that are not needed for indexing.
# drop() is used without inplace=True and the result is re-bound to the same name:
# the frame was produced by filtering another frame, and mutating such a slice in
# place triggers pandas' SettingWithCopyWarning.
valid_documents_data_clear_filtered = valid_documents_data_clear_filtered.drop(
    columns=[
        '000ajevz', 'sha', 'publish_time', 'source_x', 'doi', 'pmcid', 'pubmed_id',
        'license', 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id',
        'pdf_json_files', 'pmc_json_files', 'url', 's2_id', '_merge',
    ],
)
valid_documents_data_clear_filtered

In [None]:
# Combine title and abstract into one searchable 'text' column.
# - A space separates the two parts so the last title word and the first abstract word
#   are not fused into one token.
# - assign() returns a new frame instead of writing a column into what may be a slice
#   of another frame (which triggers SettingWithCopyWarning).
# A missing title or abstract still yields NaN for 'text'.
valid_documents_data_clear_filtered = valid_documents_data_clear_filtered.assign(
    text=(
        valid_documents_data_clear_filtered['title']
        + ' '
        + valid_documents_data_clear_filtered['abstract']
    ),
)

In [None]:
# The separate title and abstract columns are no longer needed once 'text' exists.
# Re-bind the result of drop() rather than using inplace=True, so that a frame that
# originated as a slice of another frame is never mutated in place.
valid_documents_data_clear_filtered = valid_documents_data_clear_filtered.drop(
    columns=['title', 'abstract'],
)

In [None]:
# Remove every row that still has a missing value in any remaining column.
valid_documents_data_clear_filtered = valid_documents_data_clear_filtered.dropna(
    axis=0,
    how='any',
)

In [None]:
!pip install hashedindex
import hashedindex

In [None]:
# Empty index; it is filled further down with (cleaned query, document id) occurrences.
index = hashedindex.HashedIndex()

In [None]:
# Display the filtered document table for a visual check before indexing.
valid_documents_data_clear_filtered

In [None]:
# Fill the index: for every (document, topic) pair, record one occurrence of the topic's
# cleaned query for each query word that appears as a substring of the document text.
# A document's occurrence count for a query is therefore the number of query words it
# contains.

# The query words do not depend on the document, so split each query once up front
# instead of once per document.
query_words = [(query, query.split()) for query in topics['query']]

for cord_uid, doc_text in zip(
    valid_documents_data_clear_filtered['cord_uid'],
    valid_documents_data_clear_filtered['text'],
):
    doc_id = str(cord_uid)
    # `clear_df` lower-cases the query tokens, so compare against lower-cased document
    # text; otherwise capitalised words in a title or abstract could never match.
    doc_text_lower = doc_text.lower()
    for query, words in query_words:
        for word in words:
            if word in doc_text_lower:
                index.add_term_occurrence(query, doc_id)
            

In [None]:
# Ad-hoc inspection of the index.
# NOTE(review): a notebook cell only displays its last expression, so the result of the
# next line is computed and then discarded.
index.items().keys()
# First ten entries of `most_common` for one hard-coded cleaned query string.
# NOTE(review): presumably this fails if that exact string is not a term in the index
# (e.g. after the cleaning step changes) — confirm.
index.get_documents('coronavirus response weather change').most_common(10)

In [None]:
# Build the submission table: for each topic, up to ten document ids taken from
# `most_common(10)` of the documents recorded for the topic's cleaned query.
# Rows are collected in a plain list and converted to a DataFrame once, instead of
# growing the DataFrame one `.loc` insertion at a time.
indexed_queries = index.items()
submission_rows = []
for topic_id, query in zip(topics['topic-id'], topics['query']):
    # A query is only present in the index if at least one of its words matched some
    # document; skip the others rather than looking up a term that was never added.
    if query not in indexed_queries:
        continue
    for cord_id, _occurrences in index.get_documents(query).most_common(10):
        submission_rows.append([topic_id, cord_id])

results = pd.DataFrame(submission_rows, columns=['topic-id', 'cord-id'])


In [None]:
# Write the (topic-id, cord-id) pairs to submission.csv without the DataFrame index column.
results.to_csv('submission.csv', index=False)