# Import libraries

In [102]:
import os
import json
import glob
import sys
import numpy as np 
import pandas as pd

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import nltk
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *


## Load Data

In [103]:
# Just set up a quick blank dataframe to hold all these medical papers. 
corona_features = {"doc_id": [None], "source": [None], "title": [None],
                  "abstract": [None], "text_body": [None]}
corona_df = pd.DataFrame.from_dict(corona_features)
# dataframe now set up, lets grab all the json file names and for the same we can use the very handy glob library

json_filenames = glob.glob(f'C:/Data/DS NN/Data/CORD-19-research-challenge/**/*.JSON', recursive=True)

def return_corona_df(json_filenames, df, source):

    for file_name in json_filenames:

        row = {"doc_id": None, "source": None, "title": None,
              "abstract": None, "text_body": None}

        with open(file_name) as json_data:
            data = json.load(json_data)

            row['doc_id'] = data['paper_id']
            row['title'] = data['metadata']['title']

            # Now need all of abstract. Put it all in 
            # a list then use str.join() to split it
            # into paragraphs. 

            abstract_list = [data['abstract'][x]['text'] for x in range(len(data['abstract']) - 1)]
            abstract = "\n ".join(abstract_list)

            row['abstract'] = abstract


            
            body_list = []
            for _ in range(len(data['body_text'])):
                try:
                    body_list.append(data['body_text'][_]['text'])
                except:
                    pass

            body = "\n ".join(body_list)
            
            row['text_body'] = body
            
            # Now just add to the dataframe. 
            
            if source == 'b':
                row['source'] = "biorxiv_medrxiv"
            elif source == "c":
                row['source'] = "common_use_sub"
            elif source == "n":
                row['source'] = "non_common_use"
            elif source == "p":
                row['source'] = "pmc_custom_license"
            
            df = df.append(row, ignore_index=True)
    
    return df
    


In [106]:
# Use the dataframe from here using using this

research_df = return_corona_df(json_filenames, corona_df, 'b')
# Or can also download the research paper data from the CSV file using 3 columns (abstract, title, authoris)
#research_df.to_csv("paper.csv")
#research_df=pd.read_csv('Data/paper.csv', usecols=['doc_id','abstract','title'])

In [107]:
research_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29316 entries, 0 to 29315
Data columns (total 4 columns):
abstract          29316 non-null object
doc_id            29315 non-null object
title             25732 non-null object
abstract_words    29316 non-null int64
dtypes: int64(1), object(3)
memory usage: 916.2+ KB


In [108]:
research_df.head()

Unnamed: 0,abstract,doc_id,title,abstract_words
0,,,,0
1,word count: 194 22 Text word count: 5168 23 24...,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,175
2,,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,0
3,Infectious bronchitis (IB) causes significant ...,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",1601
4,Nipah Virus (NiV) came into limelight recently...,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,271


In [109]:
def new_len(x):
    if type(x) is str:
        return len(x.split())
    else:
        return 0
research_df["abstract_words"] = research_df["abstract"].apply(new_len)
word_count = research_df.query("abstract_words !=0 and abstract_words <500")["abstract_words"]

In [110]:
fig = ff.create_distplot(hist_data=[word_count],
                         group_labels=["All abstracts"],
                         colors=["blue"])
fig.update_layout(title_text="Abstract words", xaxis_title="Abstract words", template="simple_white", showlegend=False)
fig.show()

We can see on average abstract has around 200 words, next we will see if there is any positive or negative sentiments in the abstract ideally research papers should be neutral with no sentiments.

## Sentiment Analysis

In [111]:
def polar(x):
    if type(x) is str:
        return sia.polarity_scores(x)
    else:
        return 100

In [112]:
sia = SentimentIntensityAnalyzer()
polar_0 = [pol for pol in research_df["abstract"].apply(lambda x: polar(x)) if pol != 100]

In [113]:
fig = go.Figure(go.Histogram(x=[pols["neg"] for pols in polar_0 if pols["neg"] < 0.15], marker=dict(
        color='crimson'
    )))
fig.update_layout(xaxis_title="Negativity sentiment", title_text="Negativity sentiment", template="simple_white")
fig.show()

In [114]:
fig = go.Figure(go.Histogram(x=[pols["pos"] for pols in polar_0 if pols["pos"] < 0.15], marker=dict(
        color='LightSeaGreen'
    )))
fig.update_layout(xaxis_title="Positivity sentiment", title_text="Positivity sentiment", template="simple_white")
fig.show()

As per our hypothesis we can see in the above plots that there is no positive or negative sentiments as both sentiments has maximum probability mass at 0. Now let's see if all the papers has Neutrality sentiments.

In [115]:
fig = go.Figure(go.Histogram(x=[pols["neu"] for pols in polar_0], marker=dict(
        color='MediumPurple'
    )))
fig.update_layout(xaxis_title="Neutrality sentiment", title_text="Neutrality sentiment", template="simple_white")
fig.show()

From the above plot it is clear that that the neutrality sentiment distribution has a strong leftward skew which is in opposite to the negativity and positivity sentiment distributions. So we can say that research papers has no sentiments and has only facts which is good and as expected.

## Latend Dirichlet Allocation

In [116]:
research_df[['abstract']] = research_df[['abstract']].fillna('')
data_text = research_df[['abstract']]
data_text['index'] = data_text.index
documents = data_text

In [117]:
len(documents)

29316

In [118]:
documents[:5]

Unnamed: 0,abstract,index
0,,0
1,word count: 194 22 Text word count: 5168 23 24...,1
2,,2
3,Infectious bronchitis (IB) causes significant ...,3
4,Nipah Virus (NiV) came into limelight recently...,4


In [119]:
stemmer = SnowballStemmer("english")

In [120]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [121]:
doc_sample = documents[documents['index'] == 3410].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['is', 'not', 'known', 'whether', 'they', 'edit', 'RNA', 'genomes', 'through', 'cytidine', 'deamination.', 'Here,', 'we', 'investigated', 'APOBEC3-mediated', 'restriction', 'of', 'Coronaviridae.', 'In', 'experiments', 'in', 'vitro,', 'three', 'human', 'APOBEC3', 'proteins', '(A3C,', 'A3F', 'and', 'A3H)', 'inhibited', 'HCoV-NL63', 'infection', 'and', 'limited', 'production', 'of', 'progeny', 'virus,', 'but', 'did', 'not', 'cause', 'hypermutation', 'of', 'the', 'coronaviral', 'genome.', 'APOBEC3-mediated', 'restriction', 'was', 'partially', 'dependent', 'on', 'enzyme', 'activity,', 'and', 'was', 'reduced', 'by', 'the', 'use', 'of', 'enzymatically', 'inactive', 'APOBEC3.', 'Moreover,', 'APOBEC3', 'proteins', 'bound', 'to', 'the', 'coronaviral', 'nucleoprotein,', 'and', 'this', 'interaction', 'also', 'affected', 'viral', 'replication.', 'Although', 'the', 'precise', 'molecular', 'mechanism', 'of', 'deaminasedependent', 'inhibition', 'of', 'coronavirus', 'replication', '

In [122]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:20]

0                                                    []
1     [word, count, text, word, count, author, funde...
2                                                    []
3     [infecti, bronchiti, caus, signific, econom, l...
4     [nipah, virus, come, limelight, recent, outbre...
5                                                    []
6     [face, current, larg, scale, public, health, e...
7     [virus, interact, hundr, thousand, protein, ma...
8                                                    []
9                                                    []
10                                                   []
11    [human, astrovirus, small, nonenvelop, virus, ...
12    [ribosom, frameshift, translat, implic, human,...
13    [recent, outbreak, infect, novel, coronavirus,...
14    [note, logist, growth, regress, model, estim, ...
15                                                   []
16                                                   []
17    [outbreak, novel, coronavirus, name, covid

In [123]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [124]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 100:
        break

0 abstract
1 acetyl
2 act
3 allow
4 analys
5 analysi
6 approxim
7 author
8 characteris
9 compris
10 confirm
11 contain
12 contribut
13 count
14 delet
15 determin
16 diseas
17 domain
18 element
19 entri
20 essenti
21 experi
22 extens
23 facilit
24 flank
25 fmdv
26 foot
27 frame
28 function
29 funder
30 genom
31 hydroxyl
32 import
33 includ
34 initi
35 intern
36 investig
37 ire
38 larg
39 length
40 loop
41 mouth
42 mutat
43 nucleotid
44 open
45 permiss
46 picornavirus
47 poli
48 posit
49 predict
50 previous
51 primer
52 pseudoknot
53 read
54 region
55 repeat
56 replic
57 reserv
58 reus
59 ribosom
60 right
61 select
62 seri
63 shape
64 show
65 singl
66 site
67 stem
68 strand
69 structur
70 tandem
71 termin
72 text
73 tract
74 translat
75 untransl
76 unusu
77 utr
78 variabl
79 virus
80 word
81 abil
82 absenc
83 abund
84 accessori
85 accur
86 accuraci
87 acid
88 adapt
89 add
90 addit
91 agaros
92 agencourt
93 agent
94 align
95 altern
96 amino
97 amplicon
98 ampliconbas
99 amplif
100 amplifi

In [125]:
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=1000000)

In [126]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[3410]

[(17, 1),
 (18, 1),
 (21, 1),
 (30, 7),
 (36, 1),
 (56, 3),
 (61, 1),
 (65, 2),
 (79, 3),
 (87, 1),
 (95, 1),
 (120, 2),
 (183, 2),
 (187, 3),
 (207, 6),
 (209, 1),
 (214, 1),
 (223, 1),
 (243, 1),
 (264, 1),
 (265, 1),
 (296, 1),
 (297, 1),
 (301, 1),
 (302, 4),
 (304, 1),
 (324, 2),
 (373, 1),
 (378, 1),
 (387, 1),
 (390, 1),
 (418, 1),
 (436, 4),
 (458, 1),
 (468, 1),
 (475, 1),
 (483, 4),
 (486, 1),
 (496, 2),
 (505, 1),
 (507, 1),
 (509, 2),
 (528, 2),
 (539, 1),
 (584, 1),
 (600, 1),
 (615, 1),
 (643, 1),
 (665, 1),
 (685, 1),
 (699, 1),
 (754, 1),
 (765, 1),
 (778, 3),
 (787, 1),
 (820, 1),
 (843, 1),
 (856, 1),
 (860, 3),
 (874, 2),
 (875, 1),
 (908, 1),
 (930, 1),
 (991, 1),
 (1044, 1),
 (1052, 1),
 (1053, 1),
 (1063, 1),
 (1078, 1),
 (1126, 1),
 (1155, 1),
 (1236, 1),
 (1424, 1),
 (1509, 1),
 (1649, 1),
 (1719, 1),
 (1808, 2),
 (1837, 2),
 (1931, 2),
 (1972, 1),
 (2160, 1),
 (2251, 1),
 (2399, 1),
 (2438, 1),
 (2576, 1),
 (2603, 1),
 (2773, 1),
 (2861, 1),
 (2898, 1),
 (3049,

In [127]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 16 ("diseas") appears 2 time.
Word 119 ("case") appears 2 time.
Word 166 ("develop") appears 1 time.
Word 209 ("genet") appears 1 time.
Word 343 ("similar") appears 1 time.
Word 383 ("vaccin") appears 1 time.
Word 403 ("concern") appears 1 time.
Word 406 ("death") appears 1 time.
Word 420 ("literatur") appears 1 time.
Word 441 ("sever") appears 1 time.
Word 485 ("immun") appears 1 time.
Word 501 ("associ") appears 2 time.
Word 505 ("cell") appears 1 time.
Word 528 ("infect") appears 3 time.
Word 533 ("knowledg") appears 1 time.
Word 806 ("syndrom") appears 1 time.
Word 830 ("consequ") appears 2 time.
Word 837 ("defect") appears 1 time.
Word 859 ("induc") appears 1 time.
Word 883 ("patient") appears 4 time.
Word 923 ("lymphocyt") appears 1 time.
Word 941 ("present") appears 1 time.
Word 961 ("pneumonia") appears 1 time.
Word 965 ("year") appears 1 time.
Word 1001 ("character") appears 1 time.
Word 1191 ("outcom") appears 2 time.
Word 1209 ("secondari") appears 3 time.
Word 1510 ("p

Running LDA using Bag of Words

In [128]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [129]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.026*"sequenc" + 0.013*"genom" + 0.013*"gene" + 0.009*"method" + 0.007*"virus" + 0.007*"structur" + 0.007*"base" + 0.007*"protein" + 0.007*"acid" + 0.006*"detect"
Topic: 1 
Words: 0.024*"virus" + 0.021*"antibodi" + 0.014*"infect" + 0.014*"cell" + 0.013*"protein" + 0.010*"vaccin" + 0.009*"strain" + 0.009*"neutral" + 0.008*"viral" + 0.008*"pedv"
Topic: 2 
Words: 0.014*"infect" + 0.013*"cell" + 0.008*"calv" + 0.008*"mice" + 0.008*"patient" + 0.008*"diseas" + 0.007*"lung" + 0.007*"group" + 0.007*"signific" + 0.006*"clinic"
Topic: 3 
Words: 0.018*"vaccin" + 0.012*"protein" + 0.011*"cell" + 0.010*"infect" + 0.010*"virus" + 0.009*"drug" + 0.009*"develop" + 0.008*"human" + 0.008*"target" + 0.007*"immun"
Topic: 4 
Words: 0.015*"covid" + 0.014*"activ" + 0.013*"research" + 0.012*"public" + 0.011*"sourc" + 0.009*"right" + 0.009*"databas" + 0.009*"resourc" + 0.009*"origin" + 0.009*"elsevi"
Topic: 5 
Words: 0.022*"patient" + 0.016*"infect" + 0.016*"respiratori" + 0.013*"studi" + 0.

In [130]:
processed_docs[3410]

['know',
 'edit',
 'genom',
 'cytidin',
 'deamin',
 'investig',
 'apobec',
 'mediat',
 'restrict',
 'coronavirida',
 'experi',
 'vitro',
 'human',
 'apobec',
 'protein',
 'inhibit',
 'hcov',
 'infect',
 'limit',
 'product',
 'progeni',
 'virus',
 'caus',
 'hypermut',
 'coronavir',
 'genom',
 'apobec',
 'mediat',
 'restrict',
 'partial',
 'depend',
 'enzym',
 'activ',
 'reduc',
 'enzymat',
 'inact',
 'apobec',
 'apobec',
 'protein',
 'bind',
 'coronavir',
 'nucleoprotein',
 'interact',
 'affect',
 'viral',
 'replic',
 'precis',
 'molecular',
 'mechan',
 'inhibit',
 'coronavirus',
 'replic',
 'remain',
 'elus',
 'result',
 'understand',
 'apobec',
 'mediat',
 'restrict',
 'virus',
 'infect',
 'apolipoprotein',
 'mrna',
 'edit',
 'enzym',
 'catalyt',
 'polypeptid',
 'like',
 'type',
 'apobec',
 'protein',
 'depend',
 'cytidin',
 'deaminas',
 'belong',
 'apobec',
 'superfamili',
 'apobec',
 'enzym',
 'consid',
 'intrins',
 'defens',
 'eukaryot',
 'cell',
 'inhibit',
 'replic',
 'virus',
 '

In [131]:
for index, score in sorted(lda_model[bow_corpus[3410]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5296405553817749	 
Topic: 0.027*"cell" + 0.021*"protein" + 0.019*"virus" + 0.015*"infect" + 0.012*"viral" + 0.010*"activ" + 0.008*"membran" + 0.008*"host" + 0.007*"replic" + 0.007*"express"

Score: 0.22443710267543793	 
Topic: 0.026*"sequenc" + 0.013*"genom" + 0.013*"gene" + 0.009*"method" + 0.007*"virus" + 0.007*"structur" + 0.007*"base" + 0.007*"protein" + 0.007*"acid" + 0.006*"detect"

Score: 0.13446664810180664	 
Topic: 0.024*"virus" + 0.021*"antibodi" + 0.014*"infect" + 0.014*"cell" + 0.013*"protein" + 0.010*"vaccin" + 0.009*"strain" + 0.009*"neutral" + 0.008*"viral" + 0.008*"pedv"

Score: 0.07232178747653961	 
Topic: 0.034*"virus" + 0.025*"infect" + 0.017*"human" + 0.012*"respiratori" + 0.012*"detect" + 0.010*"diseas" + 0.010*"coronavirus" + 0.008*"pathogen" + 0.008*"sampl" + 0.007*"mer"

Score: 0.0355839878320694	 
Topic: 0.014*"infect" + 0.013*"cell" + 0.008*"calv" + 0.008*"mice" + 0.008*"patient" + 0.008*"diseas" + 0.007*"lung" + 0.007*"group" + 0.007*"signific" + 0.

In [132]:
unseen_document = "The global impact of COVID-19 has been profound, and the public health threat it represents is the most serious seen in a respiratory virus since the 1918 H1N1 influenza pandemic. Here we present theresults of epidemiological modelling which has informed policymaking in the UK and other countries in recent weeks. In the absence of a COVID-19 vaccine, we assess the potential role of a number of public health measures – so-called non-pharmaceutical interventions (NPIs) – aimed at reducing contact rates in the population and thereby reducing transmission of the virus. In the results presented here, we apply a previously published microsimulation model to two countries: the UK (Great Britain specifically) and the US. We conclude that the effectiveness of any one intervention in isolation is likely to be limited, requiring multiple interventions to be combined to have a substantial impact on transmission"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5408318042755127	 Topic: 0.014*"case" + 0.013*"model" + 0.009*"effect" + 0.009*"studi" + 0.008*"diseas"
Score: 0.44685491919517517	 Topic: 0.018*"health" + 0.015*"diseas" + 0.007*"develop" + 0.006*"public" + 0.006*"model"
