In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import glob
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from  collections import OrderedDict
# Input data files are available in the "../Coronavirus_19" directory.
import os

In [None]:
# Location of the CORD-19 metadata table on the local machine.
metadata_csv = "/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/metadata.csv"
meta = pd.read_csv(metadata_csv)
meta.head(2)

In [None]:
meta.columns

In [None]:
meta.shape

In [None]:
# Keep only the papers for which at least one full-text parse (PDF or PMC XML)
# is flagged as available.
has_full_text = (meta['has_pdf_parse'] == True) | (meta['has_pmc_xml_parse'] == True)
meta = meta[has_full_text]

# Narrow to the columns used downstream and discard every row whose title occurs
# more than once (keep=False removes all copies of a repeated title).
# The explicit .copy() makes meta_sm an independent frame, so the assignment
# below neither triggers SettingWithCopyWarning nor risks being lost.
meta_sm = (
    meta[['cord_uid', 'sha', 'pmcid', 'title', 'abstract', 'publish_time', 'url']]
    .drop_duplicates(subset="title", keep=False)
    .copy()
)

# Re-date records stamped 2020-12-31 to 2020-03-31. Only the publish_time column
# is targeted: without the column selector, .loc overwrote EVERY column of the
# matching rows (title, abstract, sha, ...) with the date string.
meta_sm.loc[meta_sm['publish_time'] == '2020-12-31', 'publish_time'] = "2020-03-31"
meta_sm.head(2)

In [None]:
# Put the parent directory first on the module search path.
sys.path.insert(0, "../")

# Folder that holds the CORD-19 dump; the JSON files are searched for below it.
root_path = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/'

# Empty two-column frame meant to receive one row per parsed paper.
# Approach adapted from https://www.kaggle.com/fmitchell259/create-corona-csv-file
df = pd.DataFrame({"paper_id": [], "text_body": []})
df

In [None]:
# Collect every JSON file underneath the dataset root (recursive search).
collect_json = glob.glob(f'{root_path}/**/*.json', recursive=True)

# Accumulate one plain dict per paper and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic cost), and DataFrame.append was removed in pandas 2.0.
# Starting from an empty list also makes the cell idempotent: re-running it no
# longer stacks a second copy of every paper onto the existing `df`.
records = []
for i, file_name in enumerate(collect_json):
    if i % 10000 == 0:
        print("====processed " + str(i) + ' json files=====')
        print()

    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(file_name, encoding='utf-8') as json_data:
        data = json.load(json_data, object_pairs_hook=OrderedDict)

    # Gather the text of each body paragraph. Entries that are not mappings or
    # that lack a 'text' key are skipped explicitly (this replaces the former
    # bare `except: pass`, which would also have hidden unrelated errors).
    body_list = [
        paragraph['text']
        for paragraph in data['body_text']
        if isinstance(paragraph, dict) and 'text' in paragraph
    ]

    records.append({"paper_id": data['paper_id'], "text_body": "\n ".join(body_list)})

# Same two columns, in the same order, as the frame the loop used to grow.
df = pd.DataFrame(records, columns=["paper_id", "text_body"])

In [None]:
df.shape

In [None]:
# Inner join: attach the parsed text to the metadata rows whose `sha`
# equals the `paper_id` of a parsed JSON file.
merge1 = meta_sm.merge(df, left_on='sha', right_on='paper_id')
merge1.head(2)

In [None]:
len(merge1)

In [None]:
# Inner join: attach the parsed text to the metadata rows whose `pmcid`
# equals the `paper_id` of a parsed JSON file.
merge2 = meta_sm.merge(df, left_on='pmcid', right_on='paper_id')
merge2.head(2)

In [None]:
print(len(merge2))
# Stack the pmcid-based matches on top of the sha-based ones.
# pd.concat is the replacement for DataFrame.append (removed in pandas 2.0).
merge_final = pd.concat([merge2, merge1], ignore_index=True)
# A paper matched through both its pmcid and its sha shows up twice after the
# concat. Keep one copy (the first, i.e. the pmcid match) instead of throwing
# the paper away altogether: keep=False used to delete BOTH copies, silently
# removing every paper that had two parses.
merge_final = merge_final.drop_duplicates(subset="title", keep="first")
print(len(merge_final))
merge_final.head(2)

In [None]:
# Keep only papers that are likely about COVID-19: publish_time greater than
# '2019-11-01' AND a coronavirus keyword somewhere in the body text.
# NOTE(review): publish_time is compared against a string literal, so this
# relies on the column holding ISO-formatted date strings — confirm.
published_recently = merge_final['publish_time'] > '2019-11-01'
# na=False: a missing body counts as "no match" instead of putting NaN into the mask.
mentions_virus = merge_final['text_body'].str.contains(
    'nCoV|Cov|COVID|covid|SARS-CoV-2|sars-cov-2', na=False
)
# .copy() detaches the result from merge_final; the next cells overwrite columns
# of `corona`, which on a plain filtered slice raises SettingWithCopyWarning.
corona = merge_final[published_recently & mentions_virus].copy()
corona.shape

In [None]:
import re 
def clean_dataset(text):
    """Strip citations, links, e-mail addresses and punctuation from a text.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str``; ``None``
        and float NaN (how pandas represents a missing cell) yield ``''``
        instead of the literal words "None" / "nan".

    Returns
    -------
    str
        The text with, in this order: bracketed in-text citations removed,
        lines starting with an http(s) link removed, e-mail addresses removed,
        a leading "a1111111111 ..." line removed, all characters that are
        neither word characters nor whitespace removed, and runs of spaces
        collapsed to a single space.
    """
    # Missing value: nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # In-text citations such as "[12]" or "[3, 4]" (shortest bracketed span).
    text = re.sub(r'\[.*?\]', '', text)
    # Any line that begins with a hyperlink, including its line break.
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)
    # E-mail addresses. The previous pattern spelled \b as '\\b' inside a raw
    # string, i.e. a literal backslash followed by 'b', so it never matched.
    text = re.sub(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b', '', text)
    # Placeholder line found at the very start of some documents.
    text = re.sub(r'^a1111111111 a1111111111 a1111111111 a1111111111 a1111111111.*[\r\n]*', ' ', text)
    # Drop everything that is not a word character or whitespace. This also
    # covers the ',', '.', '!' and '?' that were removed in a separate pass.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse runs of spaces last, so gaps left by the removals above are
    # closed as well.
    text = re.sub(r'  +', ' ', text)
    return text

In [None]:
import warnings
warnings.filterwarnings('ignore')
# Run the cleaner over the three free-text columns; the body text is
# additionally lower-cased. Column order is left untouched.
corona = corona.assign(
    text_body=corona['text_body'].apply(clean_dataset).str.lower(),
    title=corona['title'].apply(clean_dataset),
    abstract=corona['abstract'].apply(clean_dataset),
)
# Fresh 0..n-1 index for the cleaned corpus.
coro = corona.reset_index(drop=True)
coro.head(2)

In [None]:
# Number of whitespace-separated words in every abstract.
abstract_tokens = coro['abstract'].str.split()
coro['count_abstract'] = abstract_tokens.map(len)
coro['count_abstract'].sort_values()
# Distribution of the abstract word counts.
y = coro['count_abstract'].to_numpy()
sns.distplot(y)

In [None]:
# Number of whitespace-separated words in every body text.
coro['count_text'] = coro['text_body'].str.split().map(len)
coro['count_text'].sort_values(ascending=True)
#check word count
import seaborn as sns
import matplotlib.pyplot as plt
# Plot the column computed in this cell. The cell previously plotted
# 'count_abstract' again (copy-paste from the cell above), so the body-length
# distribution was never shown here.
y = np.array(coro['count_text'])
sns.distplot(y)

In [None]:
# Word count of each body text, then a look at the sorted values.
body_tokens = coro['text_body'].str.split()
coro['count_text'] = body_tokens.map(len)
coro['count_text'].sort_values()

In [None]:
coro['count_text'].describe()

In [None]:
# Distribution of the body-text word counts.
y = coro['count_text'].to_numpy()
sns.distplot(y)

In [None]:
# Keep papers whose body has more than 500 and fewer than 4000 words
# (both bounds exclusive).
long_enough = coro['count_text'] > 500
short_enough = coro['count_text'] < 4000
coro2 = coro.loc[long_enough & short_enough]
coro2.shape

In [None]:
# Persist the length-filtered corpus without the index column.
coro2.to_csv("corona.csv", index=False)
# Papers whose abstract has fewer than five words (i.e. effectively no
# abstract) are set aside as the test set.
no_abstract = coro2['count_abstract'] < 5
test = coro2.loc[no_abstract]
test.head(2)
print(test.shape)

In [None]:
# Training set: every filtered paper that was not placed in the test set.
train = coro2.drop(index=test.index)
train.head(2)
print(train.shape)

In [None]:
# Give both splits a fresh 0..n-1 index, discarding the old labels.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

# Bert extractive summarizer

https://pypi.org/project/bert-extractive-summarizer/

https://github.com/dmmiller612/bert-extractive-summarizer

# Bart summarization

https://gist.github.com/dizzySummer/0377bb6db284d3df45fdf75fe5394647#file-bart-summarization-ipynb

In [None]:
#!pip install transformers
#!pip install torch

In [None]:
import transformers
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
torch_device = 'cpu'

In [None]:
from transformers import pipeline
# Build a Hugging Face summarization pipeline. No model is named, so whatever
# checkpoint the installed transformers version uses as its summarization
# default is loaded.
# NOTE(review): the original comment calls this a BART summarizer; which
# checkpoint the default resolves to depends on the library version — confirm,
# or pin it with an explicit `model=` argument.
summarizer = pipeline(task="summarization")

In [None]:
#bart = torch.hub.load('pytorch/fairseq', 'bart.large')
#bart.eval()  # disable dropout (or leave in train mode to finetune)

In [None]:
# Load the CNN/DailyMail fine-tuned BART checkpoint by its namespaced hub id.
# The bare shortcut 'bart-large-cnn' is a legacy name; the checkpoint is
# published as 'facebook/bart-large-cnn'.
BART_CHECKPOINT = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

In [None]:
abstract="introduction an epidemic of coronavirus disease 2019 (covid-19) began in december 2019 in china leading to a public health emergency of international concern (pheic). clinical, laboratory, and imaging features have been partially characterized in some observational studies. no systematic reviews on covid-19 have been published to date. methods we performed a systematic literature review with meta-analysis, using three databases to assess clinical, laboratory, imaging features, and outcomes of covid-19 confirmed cases. observational studies and also case reports, were included, and analyzed separately. we performed a random-effects model meta-analysis to calculate the pooled prevalence and 95% confidence interval (95%ci). results 660 articles were retrieved for the time frame (1/1/2020-2/23/2020). after screening, 27 articles were selected for full-text assessment, 19 being finally included for qualitative and quantitative analyses. additionally, 39 case report articles were included and analyzed separately. for 656 patients, fever (88.7%, 95%ci 84.5–92.9%), cough (57.6%, 40.8–74.4%) and dyspnea (45.6%, 10.9–80.4%) were the most prevalent manifestations. among the patients, 20.3% (95%ci 10.0–30.6%) required intensive care unit (icu), 32.8% presented with acute respiratory distress syndrome (ards) (95%ci 13.7–51.8), 6.2% (95%ci 3.1–9.3) with shock. some 13.9% (95%ci 6.2–21.5%) of hospitalized patients had fatal outcomes (case fatality rate, cfr). conclusion covid-19 brings a huge burden to healthcare facilities, especially in patients with comorbidities. icu was required for approximately 20% of polymorbid, covid-19 infected patients and hospitalization was associated with a cfr of over 13%. as this virus spreads globally, countries need to urgently prepare human resources, infrastructure and facilities to treat severe covid-19."

# Smoke test: summarise the sample abstract with length bounds 50 / 200.
summary = summarizer(abstract, max_length=200, min_length=50)
print(summary)

In [None]:
# Summarise the first two training papers only.
# .copy() makes train_summary an independent frame: adding a column to a bare
# iloc slice raises SettingWithCopyWarning and is not guaranteed to stick.
train_summary = train.iloc[0:2, :].copy()
# Each cell receives the raw return value of the summarizer call.
# NOTE(review): the full body text is passed in one piece; bodies can be up to
# 4000 words after the length filter — confirm the model accepts inputs of
# that size, or truncate / chunk them first.
train_summary["text_summary"] = train_summary["text_body"].apply(
    lambda x: summarizer(x, min_length=50, max_length=200)
)

# LDA topic modelling

In [None]:
#remove stop words
import gensim
from gensim.parsing.preprocessing import remove_stopwords

# Corpus-specific boilerplate words to drop in addition to gensim's built-in list.
my_extra_stop_words = ['preprint','paper','copyright','case','also','moreover','use','from', 'subject', 're', 'edu', 'use','and','et','al','medrxiv','peerreviewed','peerreview','httpsdoiorg','license','authorfunder','grant','ccbyncnd','permission','grant','httpsdoiorg101101202002']

# Merge both stop-word sources into one set, built once. Each token then costs
# a single hash lookup instead of a scan of the list above plus a second lookup
# in gensim's set.
stop_words = gensim.parsing.preprocessing.STOPWORDS.union(my_extra_stop_words)

# Keep tokens that are longer than three characters and not stop words.
train['text_body'] = train['text_body'].apply(
    lambda x: ' '.join(word for word in x.split() if len(word) > 3 and word not in stop_words)
)

# Working copy with a fresh index for the topic-modelling steps.
coronaRe = train.reset_index(drop=True)

In [None]:
import spacy
nlp=spacy.load("en_core_web_sm",disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise documents, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        One string per document.
    allowed_postags : collection of str
        Coarse part-of-speech tags (compared against ``token.pos_``) to keep.
        Tokens with any other tag are dropped. The default is a tuple rather
        than a list so the default object cannot be mutated between calls.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of its retained tokens, in order.

    Notes
    -----
    Uses the module-level spaCy pipeline ``nlp``.
    """
    allowed = set(allowed_postags)
    text_out = []
    # nlp.pipe processes the documents as a stream, in batches, which is the
    # documented way to run a pipeline over many texts.
    for doc in nlp.pipe(texts):
        # The POS filter is what `allowed_postags` is for; it used to be
        # accepted but ignored, so every token's lemma was kept.
        text_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return text_out
coronaRe['new_lem'] = lemmatization(coronaRe['text_body'],allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
from gensim.corpora import Dictionary
# Token <-> id mapping built from the lemmatised documents.
docs = coronaRe['new_lem']
dictionary = Dictionary(docs)

# Drop tokens that appear in fewer than 10 documents or in more than half of them.
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words vector for every document.
corpus = list(map(dictionary.doc2bow, docs))

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
coronaRe.head(2)

In [None]:
import gensim.corpora as corpora
# Rebuild the dictionary from the lemmatised documents (this replaces the
# `dictionary` created in the previous cell).
dictionary = corpora.Dictionary(coronaRe['new_lem'])
# Number of (id, token) entries before filtering.
count = sum(1 for _ in dictionary.iteritems())
# Drop tokens found in fewer than 15 documents (absolute count) or in more than
# 50% of the documents (fraction), then keep at most the 4500 most frequent.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=4500)
# Bag-of-words vector per document, plus the matching cord_uid for each row.
bow_corpus = list(map(dictionary.doc2bow, coronaRe['new_lem']))
bow_corpus_id = list(coronaRe['cord_uid'])

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus (fixed seed 100).
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=10,
    chunksize=100,
    random_state=100,
    per_word_topics=True,
)

In [None]:
from pprint import pprint
# Print the keywords of the 10 topics
pprint(lda_model.print_topics())
# Apply the model to bow_corpus, the corpus it was trained on. `corpus` was
# built from the earlier dictionary (different filter_extremes settings), so
# its token ids do not correspond to this model's id2word mapping.
doc_lda = lda_model[bow_corpus]
# Highest keyword probability is the topic

In [None]:
# Per-document topic distribution from the 10-topic model.
# NOTE(review): the code below assumes every document yields exactly one
# (topic_id, probability) pair per topic, in topic order — it assigns
# num_topics column names and reads element [1] of every cell. Confirm that
# minimum_probability=0 guarantees this.
lda_df = lda_model.get_document_topics(bow_corpus,minimum_probability=0)
# From here on `lda_df` is a DataFrame: one row per document, one cell per pair.
lda_df = pd.DataFrame(list(lda_df))

num_topics = lda_model.num_topics

lda_df.columns = ['Topic'+str(i) for i in range(num_topics)]
# Replace every cell by its second element (index 1).
for i in range(len(lda_df.columns)):
    lda_df.iloc[:,i]=lda_df.iloc[:,i].apply(lambda x: x[1])
# Position of the largest value in each row, stored as the document's topic id.
lda_df['Automated_topic_id'] =lda_df.apply(lambda x: np.argmax(x),axis=1)
lda_df.head(2)

In [None]:
# Background on interpreting coherence values:
# https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad
from gensim.models import CoherenceModel
from tqdm import tqdm

# Fit one LDA model per candidate topic count (5..25) and record its c_v
# coherence. Note that `lda_model` is overwritten on every iteration.
coherenceList_cv = []
num_topics_list = np.arange(5, 26)
for num_topics in tqdm(num_topics_list):
    lda_model = gensim.models.LdaModel(
        corpus=bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        chunksize=100,
        random_state=100,
        alpha='auto',
        per_word_topics=True,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=coronaRe['new_lem'],
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherenceList_cv.append(coherence_lda)
# Only the score of the last model of the sweep is printed here.
print('\nCoherence Score: ', coherence_lda)

In [None]:
# TODO (author's note): this plot was flagged as "not correct" and needs a re-do.
# Coherence score against the number of topics tried in the sweep.
plotData = pd.DataFrame({
    'Number of topics': num_topics_list,
    'CoherenceScore_cv': coherenceList_cv,
})
f, ax = plt.subplots(figsize=(10, 6))
sns.set_style("darkgrid")
sns.pointplot(data=plotData, x='Number of topics', y='CoherenceScore_cv')
plt.title('Topic coherence')

In [None]:
#final model

# Final model: 17 topics, same corpus, dictionary and seed as before.
Lda = gensim.models.LdaMulticore
lda_final = Lda(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=17,
    random_state=100,
    chunksize=100,
    passes=10,
)

In [None]:
from pprint import pprint
# Print the keywords of the 17 topics of the final model
pprint(lda_final.print_topics())
# Apply the model to bow_corpus, the corpus it was trained on. `corpus` was
# built from the earlier dictionary (different filter_extremes settings), so
# its token ids do not correspond to this model's id2word mapping.
doc_lda = lda_final[bow_corpus]

In [None]:
# Per-document topic distribution from the final model.
# NOTE(review): the code below assumes every document yields exactly one
# (topic_id, probability) pair per topic, in topic order — it assigns
# num_topics column names and reads element [1] of every cell. Confirm that
# minimum_probability=0 guarantees this.
lda_df = lda_final.get_document_topics(bow_corpus,minimum_probability=0)
# From here on `lda_df` is a DataFrame: one row per document, one cell per pair.
lda_df = pd.DataFrame(list(lda_df))
# Single-column frame holding the cord_uid values collected with bow_corpus.
lda_id=pd.DataFrame(list(bow_corpus_id))
num_topics = lda_final.num_topics

lda_df.columns = ['Topic'+str(i) for i in range(num_topics)]

# Replace every cell by its second element (index 1).
for i in range(len(lda_df.columns)):
    lda_df.iloc[:,i]=lda_df.iloc[:,i].apply(lambda x: x[1])

# Position of the largest value in each row, stored as the document's topic id.
lda_df['Automated_topic_id'] =lda_df.apply(lambda x: np.argmax(x),axis=1)

# Attach the document ids; this relies on lda_id and lda_df sharing the same
# default integer index.
lda_df['cord_uid']= lda_id
# Show the single row at position 39.
lda_df[39:40]

In [None]:
# Topic assignment per document id.
topic = lda_df.loc[:, ['Automated_topic_id', 'cord_uid']]
# Number of documents assigned to each topic, as a two-column frame.
plot_topics = lda_df['Automated_topic_id'].value_counts().reset_index()
plot_topics.columns = ["topic_id", "quantity"]
plot_topics.head(5)

In [None]:
ax = sns.barplot(x="topic_id", y="quantity",  data=plot_topics)

In [None]:
# Copy the assigned topic id onto the documents; pandas aligns the values on
# the row index of the two frames.
coronaRe['topic_id'] = topic.loc[:, 'Automated_topic_id']
coronaRe.head(2)

# NER - Named Entity Recognition

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook. The release is pinned by URL.
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bionlp13cg_md-0.2.4.tar.gz

In [None]:
import spacy
from spacy import displacy
from collections import Counter

import en_ner_bionlp13cg_md
# Load the en_ner_bionlp13cg_md model; this rebinds `nlp`, replacing the
# pipeline that was loaded earlier for lemmatisation.
nlp = en_ner_bionlp13cg_md.load()
# Abstract of the training row labelled 2.
text = train.loc[2, 'abstract']
doc = nlp(text)
sentences = list(doc.sents)
print(sentences)

In [None]:
print(doc.ents)

In [None]:
from spacy import displacy
# Dependency-parse diagram of the first sentence of the document.
first_sentence = next(doc.sents)
displacy.render(first_sentence, style='dep', jupyter=True, options={'distance': 110})

In [None]:
displacy.render(doc, style='ent')