In [None]:
import re
import numpy as np
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
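# One-time NLTK resource downloads (uncomment on a fresh environment):
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')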

In [None]:
# Read the CORD-19 metadata, keep the rows whose publish_time compares greater
# than the string "2020-01-01", and retain only the columns used downstream.
metadata_path = '../input/CORD-19-research-challenge/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={'doi': str})
meta_df = (
    meta_df.loc[
        meta_df['publish_time'] > "2020-01-01",
        ['cord_uid', 'doi', 'title', 'abstract',
         'publish_time', 'authors', 'journal', 'pdf_json_files'],
    ]
    .reset_index(drop=True)
    # cord_uid is referred to as paper_id in the rest of the notebook.
    .rename(columns={'cord_uid': 'paper_id'})
)
meta_df.head(3)

In [None]:
# Row count of the (already date-filtered) frame, then how many of those rows have an abstract.
print('{} Total papers in original df'.format(len(meta_df.index)))
print('{} Papers with abstract'.format(int(meta_df['abstract'].notna().sum())))

In [None]:
# Light normalisation of raw text
def minimal_clean_text(text):
    """Lowercase ``text`` and turn every hyphen and en dash into a space.

    Returns the normalised string, e.g. "Long-COVID" becomes "long covid".
    No other characters are touched.
    """
    return re.sub('[-–]', ' ', text.lower())

# Lazily built NLTK resources shared by every preproc_text call.
_PREPROC_CACHE = {}


def _preproc_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on first use."""
    if 'resources' not in _PREPROC_CACHE:
        _PREPROC_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _PREPROC_CACHE['resources']


def preproc_text(text):
    """Turn ``text`` into a list of lemmatised tokens without stop words.

    Steps: every run of characters other than ASCII letters and spaces is
    replaced by a single space, the result is tokenised with
    ``word_tokenize``, English stop words and one-character tokens are
    dropped, and each remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Args:
        text: the string to process.

    Returns:
        A list of token strings (possibly empty).
    """
    # Building the stop-word set and the lemmatizer once instead of on every
    # call matters when this function is applied row by row to a DataFrame.
    stop_words, lemmatizer = _preproc_resources()

    # Substitute a space (not the empty string) so that words separated only
    # by punctuation, digits or newlines ("and/or", "fatigue,dyspnea") are not
    # glued together into one bogus token.
    text = re.sub(r'[^A-Za-z ]+', ' ', text)
    word_tokens = word_tokenize(text)

    # The stop-word list is lowercase, so compare case-insensitively; for
    # already-lowercased input this is identical to a plain membership test.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokens
        if len(token) > 1 and token.lower() not in stop_words
    ]

In [None]:
# Initial processing: discard papers without an abstract, then normalise the
# remaining abstracts with minimal_clean_text.

meta_df = (
    meta_df
    .dropna(subset=['abstract'])
    .assign(abstract=lambda frame: frame['abstract'].apply(minimal_clean_text))
)
meta_df.head(3)

In [None]:
# Flag the abstracts that mention a long-covid phrase and collect those papers
# in long_covid_df.

keywords = ['long covid',
            'post covid 19 syndrome',
            'post covid 19 condition',
            'post acute sequelae of covid 19',
            'chronic covid syndrome']

# One boolean column per phrase, named after the phrase with spaces replaced
# by underscores so the name can be used inside DataFrame.query.
flag_columns = [phrase.replace(' ', '_') for phrase in keywords]
for keyword, flag_column in zip(keywords, flag_columns):
    meta_df[flag_column] = meta_df['abstract'].apply(lambda abstract: keyword in abstract)

# A paper qualifies as soon as any one of its phrase flags is True.
long_covid_query = ' or '.join('{}==True'.format(flag_column) for flag_column in flag_columns)
long_covid_df = meta_df.query(long_covid_query).reset_index(drop=True)
long_covid_df['tokenized_abstract'] = long_covid_df['abstract'].apply(preproc_text)

matched_share = len(long_covid_df.index) / len(meta_df.index) * 100
print('{:.2f}% of the abstracts contain the long covid related phrases'.format(matched_share))
print('{} papers in total'.format(len(long_covid_df.index)))

Get SPECTER document embeddings, as described in [Cohan et al. (2020)](https://arxiv.org/abs/2004.07180), using the public API documented in the [allenai/paper-embedding-public-apis repository](https://github.com/allenai/paper-embedding-public-apis#specter).

In [None]:
from typing import Dict, List
import json

import requests


# Endpoint that embed() posts batches of papers to.
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
# Default number of items per batch produced by chunks().
MAX_BATCH_SIZE = 16


def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items each."""
    slice_starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in slice_starts)


def embed(papers, timeout=60):
    """Fetch an embedding for every paper by posting batches to ``URL``.

    Args:
        papers: list of JSON-serialisable dicts, one per paper. The caller in
            this notebook sends ``paper_id``, ``title`` and ``abstract`` keys.
        timeout: seconds to wait on each HTTP request before giving up. The
            default of 60 is a tunable guess, not a documented API limit.

    Returns:
        Dict mapping each returned ``paper_id`` to its ``embedding``. Empty
        when ``papers`` is empty.

    Raises:
        RuntimeError: if a batch is answered with a status other than 200.
        requests.exceptions.Timeout: if a request exceeds ``timeout``.
    """
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # requests serialises the chunk to JSON. Without a timeout a stalled
        # connection would block the notebook indefinitely.
        response = requests.post(URL, json=chunk, timeout=timeout)

        if response.status_code != 200:
            # Keep the original wording but report the status for diagnosis.
            raise RuntimeError(
                "Sorry, something went wrong, please try later! "
                "(HTTP {} from {})".format(response.status_code, URL)
            )

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

In [None]:
# Build one {'paper_id', 'title', 'abstract'} record per paper for the API.
# A missing title would otherwise be sent as NaN, which is not valid JSON, so
# it is replaced by an empty string first.
paper_list = (
    long_covid_df[['paper_id', 'title', 'abstract']]
    .fillna({'title': ''})
    .to_dict(orient='records')
)
embeddings = embed(paper_list)

In [None]:
# Keys of the embeddings dict become the row index; each embedding value
# spreads across the columns.
embeddings_df = pd.DataFrame.from_dict(embeddings, orient='index')
print('{} papers in total'.format(len(embeddings_df.index)))
embeddings_df.head()

In [None]:
def plot_embeddings(df):
    """Draw ``df`` as a 10x1 bar chart with the tick labels blanked on both axes."""
    axes = df.plot.bar(figsize=(10, 1)).axes
    for clear_tick_labels in (axes.set_xticklabels, axes.set_yticklabels):
        clear_tick_labels([])

In [None]:
# Bar chart of the first row of embeddings_df, i.e. one paper's embedding values.
plot_embeddings(embeddings_df.iloc[0])

In [None]:
# Bar chart of the second row of embeddings_df, for visual comparison with another paper.
plot_embeddings(embeddings_df.iloc[1])