In [None]:
#The Jupyter notebook can be downloaded using the link at the top-right corner of the Kaggle webpage if it is not fully rendered there.

import numpy as np
import gensim
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz

In [None]:
# Read the metadata CSV straight from its Google Cloud Storage URL.
# (Local alternative: df = pd.read_csv('metadata.csv'))
bucket = 'coviddata'
file = 'metadata.csv'
gcs_url = f'https://{bucket}.storage.googleapis.com/{file}'
df = pd.read_csv(gcs_url)


In [None]:
df.head()

In [None]:
# Drop identifier/bookkeeping columns; the remaining columns are kept as-is.
df2 = df.drop(
    ['sha', 'source_x', 'pmcid', 'license',
     'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text'],
    axis='columns',
)

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
# Keep only the rows that actually have an abstract.
df3 = df2[df2['abstract'].notna()]

In [None]:
df3.shape

In [None]:
df3.head()

In [None]:
# Load the scispaCy en_core_sci_md model with the tagger, parser and NER
# components disabled.
# NOTE(review): with the tagger disabled, part-of-speech based filtering in
# tokenize() presumably has no tags to work with — confirm.
import en_core_sci_md
nlp = en_core_sci_md.load(disable=["tagger", "parser", "ner"])
# Raise the pipeline's maximum accepted text length.
nlp.max_length = 2000000

In [None]:
import spacy

In [None]:
from spacy.tokenizer import Tokenizer

In [None]:
def tokenize(doc):
    """Tokenize ``doc`` with the module-level ``nlp`` pipeline and drop noise tokens.

    Args:
        doc: Raw text to tokenize (e.g. a single abstract).

    Returns:
        list of str: The text of every token that is not a stop word, not
        punctuation, and not tagged as a pronoun.
    """
    # ``token.pos`` is an integer ID, so the previous comparison with the string
    # 'PRON' could never match; ``token.pos_`` is the string label.
    # NOTE(review): pronoun filtering only takes effect when the pipeline
    # assigns part-of-speech tags (i.e. the tagger is not disabled).
    return [
        token.text
        for token in nlp(doc)
        if not (token.is_stop or token.is_punct or token.pos_ == 'PRON')
    ]

In [None]:
data = df3['abstract'].apply(tokenize)

In [None]:
data

In [None]:
# One document vector per abstract, in the same row order as df3.
vect = list(map(lambda text: nlp(text).vector, df3['abstract']))

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Build a ball-tree nearest-neighbour index over the abstract vectors;
# each lookup will return the 25 closest abstracts.
nn = NearestNeighbors(n_neighbors=25, algorithm='ball_tree')
nn.fit(vect)

In [None]:
# Free-text search query; it is turned into a vector with nlp(...) in a later cell.
# NOTE(review): "hydroxycholoroquine" looks like a misspelling of
# "hydroxychloroquine". Correcting it would change the retrieved abstracts and
# the hard-coded row positions used further down — confirm before changing.
query = "chloroquine hydroxycholoroquine HCoV-19 SARS-CoV-2 coronavirus covid-19 treatment"

In [None]:
query_vect = nlp(query).vector

In [None]:
# Find the abstracts most similar to the query. The number returned is set by
# n_neighbors=25 on the NearestNeighbors model (not 10). kneighbors returns
# (distances, indices); [1] keeps only the indices.
similar_abstracts = nn.kneighbors([query_vect])[1]

In [None]:
# similar_abstracts has one row of positional indices per query vector (a single
# query was passed), so this loop prints the matching abstracts from df3.
# Note: a later cell reads the `abstract` loop variable, so its name matters.
for abstract in similar_abstracts:
    print(df3['abstract'].iloc[abstract])

In [None]:
# Collect the retrieved abstracts into a DataFrame. The index row is taken
# explicitly from similar_abstracts (row 0 = the single query) instead of
# relying on the `abstract` loop variable leaked by the printing cell, so this
# cell also works when that loop has not been run.
output = pd.DataFrame(df3['abstract'].iloc[similar_abstracts[0]])


In [None]:
pd.set_option('display.max_colwidth', 0)
output.head(25)
#Output of the top 25 abstracts matching the query with index numbers

In [None]:
#From the above abstracts, abstracts 28684, 8950, 7683 appear relevant to our search for newer treatments.
#Abstracts 4935 and 34889 are relevant to chloroquine in treating covid-19. 
#Abstract 18811 is relevant to a monoclonal antibody treatment against covid-19.
#Abstract 30643 is relevant to a new target Abelson tyrosine-protein kinase 2 (Abl2) against covid-19.
#Abstract 43973 is important in discussing various approaches towards developing a vaccine and treatments against covid-19.



In [None]:
#Next step will be to inspect the detailed papers for these abstracts.
#Let us inspect the abstracts first and draw some conclusions.

In [None]:
#pd.set_option('display.max_colwidth', 0)
query1 = output.iloc[ 10, : ]
query1.head()

Abstract 4935: Abstract We report on chloroquine, a 4-amino-quinoline, as an effective inhibitor of the replication of the severe acute respiratory syndrome coronavirus (SARS-CoV) in vitro. Chloroquine is a clinically approved drug effective against malaria. We tested chloroquine phosphate for its antiviral potential against SARS-CoV-induced cytopathicity in Vero E6 cell culture. Results indicate that the IC50 of chloroquine for antiviral activity (8.8±1.2μM) was significantly lower than its cytostatic activity; CC50 (261.3±14.5μM), yielding a selectivity index of 30. The IC50 of chloroquine for inhibition of SARS-CoV in vitro approximates the plasma concentrations of chloroquine reached during treatment of acute malaria. Addition of chloroquine to infected cultures could be delayed for up to 5h postinfection, without an important drop in antiviral activity. Chloroquine, an old antimalarial drug, may be considered for immediate use in the prevention and treatment of SARS-CoV infections.

Abstract 4935 key takeaway:
Chloroquine inhibited the replication of SARS-CoV (the virus behind the 2003 SARS outbreak) in an in vitro study and could be an effective treatment for this infection.

In [None]:
query2 = output.iloc[ 19, : ]
query2.head()

Abstract 34889: Until recently, human coronaviruses (HCoVs), such as HCoV strain OC43 (HCoV-OC43), were mainly known to cause 15 to 30% of mild upper respiratory tract infections. In recent years, the identification of new HCoVs, including severe acute respiratory syndrome coronavirus, revealed that HCoVs can be highly pathogenic and can cause more severe upper and lower respiratory tract infections, including bronchiolitis and pneumonia. To date, no specific antiviral drugs to prevent or treat HCoV infections are available. We demonstrate that chloroquine, a widely used drug with well-known antimalarial effects, inhibits HCoV-OC43 replication in HRT-18 cells, with a 50% effective concentration (± standard deviation) of 0.306 ± 0.0091 μM and a 50% cytotoxic concentration (± standard deviation) of 419 ± 192.5 μM, resulting in a selectivity index of 1,369. Further, we investigated whether chloroquine could prevent HCoV-OC43-induced death in newborn mice. Our results show that a lethal HCoV-OC43 infection in newborn C57BL/6 mice can be treated with chloroquine acquired transplacentally or via maternal milk. The highest survival rate (98.6%) of the pups was found when mother mice were treated daily with a concentration of 15 mg of chloroquine per kg of body weight. Survival rates declined in a dose-dependent manner, with 88% survival when treated with 5 mg/kg chloroquine and 13% survival when treated with 1 mg/kg chloroquine. Our results show that chloroquine can be highly effective against HCoV-OC43 infection in newborn mice and may be considered as a future drug against HCoVs.

Note: HCoV strain OC43 (HCoV-OC43) is a different coronavirus from SARS-CoV-2 (the virus that causes COVID-19), but chloroquine seems to have a class effect against coronaviruses. 

**Abstract 43973 is an important review of vaccine and treatment approaches against COVID-19.**

In [None]:
#The full text for this abstract can be accessed here : https://github.com/bs3537/DS-Unit-4-Sprint-1-NLP/blob/master/1-s2.0-S2090123220300540-main.pdf

In [None]:
#NLP topic modeling over the abstracts retrieved above (the `output` DataFrame)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer over unigrams and bigrams, using the custom tokenize() function.
# Note: this rebinds `vect`, which earlier held the list of abstract vectors.
vect = TfidfVectorizer(stop_words='english', tokenizer = tokenize, ngram_range=(1,2))

In [None]:
tf = vect.fit_transform(output['abstract'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# scikit-learn LDA with 50 topics, a fixed random seed, using all CPU cores.
# NOTE(review): `output` is built from the 25 nearest neighbours of one query,
# so 50 topics appears to exceed the number of documents — confirm this is intended.
lda = LatentDirichletAllocation(n_components=50, random_state=0, n_jobs=-1)

In [None]:
lda.fit(tf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to print per topic.

    Each topic is printed on its own line, preceded by a blank line; a final
    blank line is printed after the last topic. Returns None.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered by descending weight, cut to the requested count.
        best_columns = weights.argsort()[::-1][:n_top_words]
        terms = []
        for column in best_columns:
            terms.append(feature_names[column])
        print("\nTopic #%d: " % topic_idx + " ".join(terms))
    print()

In [None]:
# Newer scikit-learn releases expose get_feature_names_out() and have removed
# get_feature_names(); fall back to the old name on older installations.
if hasattr(vect, "get_feature_names_out"):
    tfidf_feature_names = vect.get_feature_names_out()
else:
    tfidf_feature_names = vect.get_feature_names()
# print_top_words prints the topics itself and returns None, so `top_words` is
# always None; the name is kept only so the notebook namespace is unchanged.
top_words = print_top_words(lda, tfidf_feature_names, 25)

In [None]:
!pip install pyLDAvis

In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

In [None]:
data = output['abstract'].apply(tokenize)

In [None]:
id2word = corpora.Dictionary(data)

In [None]:
# Convert every tokenized abstract into its bag-of-words representation.
corpus = list(map(id2word.doc2bow, data))

In [None]:
# Train a gensim LDA model on the bag-of-words corpus: 15 topics, 10 passes
# over the corpus, 4 worker processes, and a fixed seed (random_state=42).
lda2 = LdaMulticore(corpus = corpus,
                   id2word = id2word,
                   random_state = 42,
                   num_topics = 15,
                   passes = 10,
                   workers = 4)

In [None]:
lda2.print_topics()

In [None]:
import re
# Each entry of print_topics() is a (topic id, formatted string) pair; the terms
# are the double-quoted pieces of the formatted string.
words = [
    re.findall(r'"([^"]*)"', description)
    for _topic_id, description in lda2.print_topics()
]

In [None]:
# Join (at most) the first ten terms of every topic into one space-separated string.
topics = [' '.join(terms[:10]) for terms in words]

In [None]:
# Print each topic's terms under a numbered header. The loop variable is named
# `topic_id` (not `id`) so the builtin id() is not shadowed for later cells.
for topic_id, t in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")

In [None]:
pyLDAvis.gensim.prepare(lda2, corpus, id2word)

In [None]:
#Hovering over the word 2019-nCoV in the visualization shows that it is most commonly present in topics 1 and 10. 
#The word COVID-19 is most commonly present in topic 1. 