![](https://media.giphy.com/media/kcUYFhoCZwWF3fivnI/giphy.gif)

* <font size="5" color="blue">Contents</font>

* [Basic Exploratory data analysis](#1)
    * [Distribution of title length]()
    * [Which are most common words]()
    * [Topic model and visualization]()


* [Find similar papers easily](#2)
    * [Method 1 : USE]()
    * [Method 2 : DBSCAN]()
    
* [Mining Related Articles](#4)
    * [My approach and preprocessing]()
   
* [Keyword Extraction](#3)
    * [Method 1: Rake]()
    * [Method 2: TextRank ]()
    
    
* [Knowledge graphs](#5) 
    * [Task 1]()
    

## <font color='blue' size='4'>Please Leave an upvote if you like it ⬆️ <font size='3' color='red'>Thank you :)</font></font>

## Loading Required Libraries <a id="0"></a>

In [None]:
!pip install rake-nltk

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords
from spacy.matcher import Matcher 
from collections import  Counter
import matplotlib.pyplot as plt
from spacy.tokens import Span 
import tensorflow_hub as hub
from rake_nltk import Rake
import tensorflow as tf
import pyLDAvis.gensim
from tqdm import tqdm
import seaborn as sns
import networkx as nx
import pandas as pd
import numpy as np
import pyLDAvis
import gensim
import spacy
import os
import gc

## <font size="5" color="blue">Exploratory Data Analysis</font><a id="1" ></a>

In [None]:
# Directory holding the CORD-19 metadata file.
path = "../input/CORD-19-research-challenge/"
all_sources = pd.read_csv(os.path.join(path, "metadata.csv"))

In [None]:
# Number of missing values in each metadata column.
all_sources.isna().sum()

###  <font color='red' size='3'>Distribution of title length</font>

In [None]:
# Character length of every title (NaN where the title is missing).
headline_length=all_sources['title'].str.len()
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm
# the installed version still provides it.
sns.distplot(headline_length)
plt.show()

### <font size='4' color='red'> Distribution of abstract length</font>

In [None]:
# Character length of every abstract. Rows without an abstract are dropped
# explicitly so the histogram never receives NaN values.
headline_length=all_sources['abstract'].str.len().dropna()
plt.hist(headline_length)
plt.show()

In [None]:
# English stop words from NLTK, stored as a set for fast membership tests.
stop=set(stopwords.words('english'))

def build_list(df,col="title"):
    """Return a flat list of lemmatized, lower-cased, non-stop-word tokens.

    Args:
        df: DataFrame containing the text column.
        col: Name of the column to tokenize; missing values are skipped.

    Returns:
        A list with one entry per kept word occurrence, in document order.
        Words are split on whitespace only, so attached punctuation is kept.
    """
    lem = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    corpus = []
    for words in df[col].dropna().str.split():
        for word in words:
            # Lower-case BEFORE the stop-word test: the NLTK list is lower
            # case, so "The" or "Of" would otherwise slip through.
            token = word.lower()
            if token not in stop_words:
                corpus.append(lem.lemmatize(token))

    return corpus

### <font color='red' size="4">Which are most common words in title?</font>

In [None]:
# Count every token in the titles and keep the 10 most frequent words that
# are not stop words. Filtering happens BEFORE the cut-off so stop words in
# the head of the ranking do not reduce the number of plotted bars.
corpus=build_list(all_sources)
counter=Counter(corpus)
most=counter.most_common()
x=[]
y=[]
for word,count in most:
    if word in stop:
        continue
    x.append(word)
    y.append(count)
    if len(x) == 10:
        break

In [None]:
# Horizontal bar chart: counts on the x-axis, words on the y-axis.
plt.figure(figsize=(9,7))
sns.barplot(x=y,y=x)

### <font color='red' size='4'>Which are the most common words in abstracts ?</font>

In [None]:
# Count every token in the abstracts and keep the 10 most frequent words that
# are not stop words. Filtering happens BEFORE the cut-off so stop words in
# the head of the ranking do not reduce the number of plotted bars.
corpus=build_list(all_sources,"abstract")
counter=Counter(corpus)
most=counter.most_common()
x=[]
y=[]
for word,count in most:
    if word in stop:
        continue
    x.append(word)
    y.append(count)
    if len(x) == 10:
        break

# Horizontal bar chart: counts on the x-axis, words on the y-axis.
plt.figure(figsize=(9,7))
sns.barplot(x=y,y=x)

### <font size='4' color='red'>Which are the most common bi-grams in title?</font>

In [None]:
def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent n-grams of ``corpus`` with their counts.

    Args:
        corpus: Iterable of raw text documents.
        n: Size of the n-grams to count. ``None`` (the default) is treated
            as 1, because ``ngram_range=(None, None)`` is not a valid
            CountVectorizer setting.
        top_k: Number of n-grams to return (10 by default).

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    if n is None:
        n = 1

    vec = CountVectorizer(ngram_range=(n, n))
    # Learn the vocabulary and count in a single pass over the corpus.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = np.asarray(bag_of_words.sum(axis=0)).ravel()

    words_freq = [(word, sum_words[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda item: item[1], reverse=True)
    return words_freq[:top_k]



In [None]:
# Most frequent two-word phrases across all non-missing titles.
top_n_bigrams = get_top_ngram(all_sources['title'].dropna(), 2)[:10]
# Split the (phrase, count) pairs into two parallel lists for plotting.
x, y = (list(column) for column in zip(*top_n_bigrams))
plt.figure(figsize=(9, 7))
sns.barplot(x=y, y=x)

### <font size='4' color='red'> Which are the most common tri-grams in title?</font>

In [None]:
# Most frequent three-word phrases across all non-missing titles.
top_tri_grams = get_top_ngram(all_sources['title'].dropna(), n=3)
# Split the (phrase, count) pairs into two parallel lists for plotting.
x, y = (list(column) for column in zip(*top_tri_grams))
plt.figure(figsize=(9, 7))
sns.barplot(x=y, y=x)

### <font color='red' size='4'> Topic modeling </font>

In [None]:
def preprocess_news(df, col='title', limit=5000):
    """Tokenize documents into lemmatized word lists for topic modeling.

    Args:
        df: DataFrame containing the text column.
        col: Column to read; defaults to ``'title'``.
        limit: Maximum number of non-missing documents to process
            (5000 by default).

    Returns:
        A list with one list of tokens per processed document. Stop words
        and tokens of length <= 2 are removed.
    """
    lem = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    corpus = []
    for news in df[col].dropna()[:limit]:
        # Compare in lower case: the NLTK stop-word list is lower case, so a
        # case-sensitive test would keep capitalized words such as "The".
        words = [w for w in word_tokenize(news) if w.lower() not in stop_words]
        corpus.append([lem.lemmatize(w) for w in words if len(w) > 2])
    return corpus

In [None]:
# Tokenize the titles, build the gensim token dictionary, and convert each
# document into its bag-of-words representation.
corpus=preprocess_news(all_sources)
dic=gensim.corpora.Dictionary(corpus)
bow_corpus = [dic.doc2bow(doc) for doc in corpus]


In [None]:
# Train a 4-topic LDA model on the bag-of-words corpus with 2 workers.
# NOTE(review): no random_state is passed, so topics may differ between runs.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dic,                                    
                                   passes = 10,
                                   workers = 2)

In [None]:
# Display the topics learned by the model.
lda_model.show_topics()

### <font color='red' size='4'>Let's visualize the topic models</font>

In [None]:

# Render the interactive pyLDAvis view of the topic model inside the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
vis

In [None]:
# Drop the large EDA objects and trigger garbage collection to free memory.
del corpus,top_n_bigrams,lda_model,bow_corpus,top_tri_grams
gc.collect()

## <font size="5" color="blue">Find Similar Papers</font><a id="2"></a>

<font color='red' size='4'>Method 1: Using Universal sentence Encoder</font>
- Done using the Universal Sentence Encoder and cosine similarity.

In [None]:

def prepare_similarity(vectors):
    """Return the pairwise cosine-similarity matrix of ``vectors``."""
    return cosine_similarity(vectors)

def get_top_similar(sentence, sentence_list, similarity_matrix, topN):
    """Return the ``topN`` entries most similar to ``sentence``.

    Args:
        sentence: Query string; must be an element of ``sentence_list``.
        sentence_list: List of sentences; position ``i`` corresponds to
            row/column ``i`` of ``similarity_matrix``.
        similarity_matrix: 2-D array-like of pairwise similarity scores.
        topN: Number of results to return. Values <= 0 yield an empty list.

    Returns:
        A list of ``(index, sentence)`` tuples ordered from most to least
        similar. The query itself is part of the ranking.

    Raises:
        ValueError: If ``sentence`` is not in ``sentence_list``.
    """
    # Position of the query (first occurrence if the sentence is duplicated).
    index = sentence_list.index(sentence)

    # Guard explicitly: slicing with [-0:] would return the WHOLE ranking.
    if topN <= 0:
        return []

    similarity_row = np.asarray(similarity_matrix)[index, :]
    # argsort is ascending, so reverse it and keep the leading topN positions.
    indices = similarity_row.argsort()[::-1][:topN]
    return [(i, sentence_list[i]) for i in indices]


In [None]:
# Local path of the Universal Sentence Encoder model files.
module_url = "../input/universalsentenceencoderlarge4" 
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.load(module_url)

In [None]:


titles=all_sources['title'].fillna("Unknown")
# Only the first 100 titles are embedded, so the lookup list is restricted to
# the same 100 titles. This keeps list positions aligned with the rows of the
# similarity matrix.
sample_titles=titles[:100]
embed_vectors=embed(sample_titles.values)['outputs'].numpy()
sentence_list=sample_titles.values.tolist()
sentence=titles.iloc[5]
print("Find similar research papers for :")
print(sentence)

similarity_matrix=prepare_similarity(embed_vectors)
similar=get_top_similar(sentence,sentence_list,similarity_matrix,6)



In [None]:
# Print each (position, title) pair followed by a blank separator.
# Note: the loop variable rebinds the module-level name `sentence`.
for sentence in similar:
    print(sentence)
    print("\n")

In [None]:
# Release the demo embeddings and similarity matrix before the next section.
del embed_vectors,sentence_list,similarity_matrix
gc.collect()

- <font color='red' size='4'>Method 2 : Cluster articles using DBSCAN</font>

In [None]:
# Parse the first 1000 titles with spaCy and keep one document vector per
# distinct title (a repeated title overwrites its earlier entry).
nlp = spacy.load('en_core_web_sm')
sent_vecs = {}
docs = []

for i in tqdm(all_sources['title'].fillna('unknown')[:1000]):
    doc = nlp(i)
    docs.append(doc)
    sent_vecs[i] = doc.vector




In [None]:
# Titles (dict keys) and their vectors (dict values), in matching order.
sentences = [*sent_vecs]
vectors = [*sent_vecs.values()]


In [None]:
# Cluster the title vectors with DBSCAN using cosine distance, then pair each
# title with the cluster label it received.
x=np.array(vectors)
dbscan=DBSCAN(eps=0.08, min_samples=2,metric='cosine' ).fit(x)
df_cluster=pd.DataFrame({'sentences':sentences,'label':dbscan.labels_})

In [None]:
# Distinct cluster labels produced by DBSCAN.
df_cluster.label.unique()

- `-1` label indicates that the sentence does not belong to any cluster.

#### Some example clusters :

In [None]:
# First few titles assigned to cluster 0.
df_cluster[(df_cluster['label']==0)].head()

In [None]:
# First few titles assigned to cluster 1.
df_cluster[(df_cluster['label']==1)].head()

## <font size='5' color='blue'>Mining Related articles</font><a id='4'></a>

### What is known about transmission, incubation, and environmental stability?

- Seasonality of transmission.
- Persistence of virus on surfaces of different materials (e.g., copper, stainless steel, plastic).
- Natural history of the virus and shedding of it from an infected person
- Implementation of diagnostics and products to improve clinical processes
- Disease models, including animal models for infection, disease and transmission
- Tools and studies to monitor phenotypic change and potential adaptation of the virus
- Immune response and immunity
- Role of the environment in transmission

## <font size='4' color='red'>My Approach</font>
- First append articles from all the sources
- Choose a subtask from above task
- Find related articles
- Form a list of abstracts from these articles
- Then do keyword extraction to get important keywords from these research papers.

In [None]:
# Directory that holds the parsed CSV files read in the next cell.
path="../input/cord-19-eda-parse-json-and-generate-clean-csv/"


In [None]:
# Load up to 5000 rows of the commercial-use subset and the full bioRxiv
# file, tag each frame with its origin, then stack them.
clean_comm = pd.read_csv(path + "clean_comm_use.csv", nrows=5000)
clean_comm = clean_comm.assign(source='clean_comm')
#clean_pmc=pd.read_csv(path+"clean_pmc.csv")
#clean_pmc['source']='clean_pmc'
biox = pd.read_csv(path + "biorxiv_clean.csv")
biox = biox.assign(source='biorx')

all_articles = pd.concat([biox, clean_comm])

In [None]:
# The per-source frames are no longer needed once they are concatenated.
del biox,clean_comm
gc.collect()

In [None]:
# (rows, columns) of the combined article table.
all_articles.shape

- Define the tasks 

In [None]:
# Task and sub-task statements. They are later added to the article table as
# extra "titles" so they can be used as similarity queries; the query strings
# used further down must match these entries exactly.
tasks=["What is known about transmission, incubation, and environmental stability",
      "What do we know about COVID-19 risk factors",
      "What do we know about virus genetics, origin, and evolution",
      "What do we know about vaccines and therapeutics",
      "What do we know about non-pharmaceutical interventions",
      "What do we know about diagnostics and surveillance",
      "What has been published about ethical and social science considerations",
      "Role of the environment in transmission",
      "Range of incubation periods for the disease in humans",
      "Prevalence of asymptomatic shedding and transmission",
      "Seasonality of transmission",
      "Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic)",
      "Susceptibility of populations",
      "Public health mitigation measures that could be effective for control",
      "Transmission dynamics of the virus",
      "Evidence that livestock could be infected",
      "Socioeconomic and behavioral risk factors for this spill-over",
      "Sustainable risk reduction strategies",
      "Resources to support skilled nursing facilities and long term care facilities",
      "Mobilization of surge medical staff to address shortages in overwhelmed communities"]

In [None]:
# One row per task statement, stored in the 'title' column and tagged 'task'.
task_df=pd.DataFrame({'title':tasks,'source':'task'})

In [None]:
# Preview the task rows.
task_df.head()

In [None]:
# Append the task rows to the articles and replace every missing value
# (including the tasks' absent abstracts) with the string "Unknown".
all_articles = pd.concat([all_articles, task_df]).fillna("Unknown")

- Find related research papers using  USE method.

In [None]:
# Embed every title (articles and task statements) and compute the full
# pairwise cosine-similarity matrix. sentence_list positions match the row
# positions of all_articles.
sentence_list=all_articles.title.values.tolist()
embed_vectors=embed(sentence_list)['outputs'].numpy()
similarity_matrix=prepare_similarity(embed_vectors)


In [None]:
# Query: one of the task statements. The ranking returned by get_top_similar
# includes the query row itself.
sentence= "Role of the environment in transmission"

similar=get_top_similar(sentence,sentence_list,similarity_matrix,10)

In [None]:
# Print only the title part of each (position, title) pair.
for sent in similar:
    print(sent[1])

- Clean and store abstracts from related articles.

In [None]:
# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

In [None]:
import re
def clean(txt):
    """Strip layout noise from an abstract.

    Line breaks are replaced by a single space, so words on adjacent lines
    are not fused together. Parenthesized fragments without nested
    parentheses are removed, and so are URLs that are directly followed by
    whitespace and the word "doi".

    Args:
        txt: Text to clean (must be a string).

    Returns:
        The cleaned text.
    """
    txt=re.sub(r'\n',' ',txt)
    txt=re.sub(r'\([^()]*\)','',txt)
    txt=re.sub(r'https?:\S+\sdoi','',txt)
    return txt

In [None]:
# Clean each abstract, then merge them into one space-separated string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)
#text_list=word_tokenize(text_list)


- Now we can proceed to see two different `keyword extraction methods`.

## <font size='5' color='blue'> Keyword Extraction</font><a id='3'></a>

![](https://pcdn.piiojs.com/i/kqctmw/vw,671,vh,0,kc,1,r,0,pr,1,wp,1/https%3A%2F%2Fmonkeylearn.com%2Fstatic%2Fimg%2Fkeyword-extraction%2Fkeyword-extraction-intro%402x.png)

Keyword extraction is the automated process of extracting the most relevant words and expressions from text. Keyword extraction (also known as keyword detection or keyword analysis) is a text analysis technique that consists of automatically extracting the most important words and expressions in a text. It helps summarize the content of a text and recognize the main topics which are being discussed. 

### <font size='3' color='red'>Method 1:Rake</font>

In [None]:
!pip install python-rake

In [None]:
# RAKE keyword extraction
import RAKE
import operator

# RAKE setup: pass the path of the stop-word list file
stop_dir = "../input/stopwordsforrake/SmartStoplist.txt"
rake_object = RAKE.Rake(stop_dir)

# The input is text_list, the merged abstract string built in an earlier cell


# Extract keywords
keywords = rake_object.run(text_list)


In [None]:
# Split the (keyword, score) pairs and print the first ten keywords.
words,score=list(map(list,zip(*keywords)))
for word in (words[:10]):
    print(word)

### <font size='3' color='red'>Method 2: PyTextRank</font>

In [None]:
!pip install pytextrank

In [None]:
import logging
import pytextrank
import spacy
import sys

In [None]:

nlp = spacy.load("en_core_web_sm")

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# NOTE(review): this logger is created but TextRank below receives logger=None.
logger = logging.getLogger("PyTR")

# add PyTextRank into the spaCy pipeline
# NOTE(review): this registration style looks like the pytextrank 2.x /
# spaCy 2.x API — confirm against the installed versions.

tr = pytextrank.TextRank(logger=None)
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

# parse the document

doc = nlp(text_list)

print("pipeline", nlp.pipe_names)
print("elapsed time: {} ms".format(tr.elapsed_time))


# examine the top-ranked phrases in the document



In [None]:
# Print the text of the first ten phrases exposed by the textrank component.
for phrase in doc._.phrases[:10]:
    print("{}".format(phrase.text))
    #print(phrase.chunks)

## <font size='5' color='blue'>Knowledge Graph</font><a id='5'></a>

- Next, we will try to draw a knowledge graph from the related papers we found.

In [None]:
import spacy
# Reload the model: this rebinds `nlp` to a fresh pipeline that does not
# contain the "textrank" component added in the previous section.
nlp=spacy.load('en_core_web_sm')

In [None]:
def get_entities(sent):
    """Extract a (subject, object) entity pair from one sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.
    Tokens are scanned left to right. Compound words and modifiers are
    remembered and prepended to the next token whose dependency label
    contains "subj" (first entity) or "obj" (second entity). When several
    subjects or objects occur, the last one wins.

    Args:
        sent: Sentence text.

    Returns:
        ``[subject, object]`` as stripped strings; either may be empty.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation is skipped entirely (as the original comment intended),
        # so it neither contributes text nor breaks a run of compound words.
        if tok.dep_ == "punct":
            continue

        # Compound word: remember it, joined with a directly preceding
        # compound word if there is one.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifier (label ending in "mod"): remember it, joined with a
        # directly preceding compound word if there is one.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject: build the first entity and reset the collected context.
        # `in` replaces `.find(...) == True`, which only matched because the
        # substring happened to start at index 1 (e.g. "nsubj").
        if "subj" in tok.dep_:
            ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
            prefix = ""
            modifier = ""

        # Object: build the second entity. The context is intentionally NOT
        # reset here, matching the original behaviour.
        if "obj" in tok.dep_:
            ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [None]:
def get_relation(sent):
    """Return the relation (predicate) phrase of a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    matched against the pattern: ROOT token, optionally followed by a
    preposition, an agent and an adjective. The text of the last match is
    returned.

    Args:
        sent: Sentence text.

    Returns:
        The matched span text, or ``""`` when nothing matches (for example
        for an empty sentence).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern
    pattern = [{'DEP':'ROOT'},
               {'DEP':'prep','OP':"?"},
               {'DEP':'agent','OP':"?"},
               {'POS':'ADJ','OP':"?"}]

    # NOTE(review): (key, on_match, pattern) is the spaCy 2.x call form —
    # confirm against the installed spaCy version.
    matcher.add("matching_1", None, pattern)

    matches = matcher(doc)
    # Without this guard, an empty match list raised IndexError.
    if not matches:
        return ""

    _match_id, start, end = matches[-1]
    return doc[start:end].text

### <font size='3' color='red'>Question : What is the Role of the environment in transmission?</font>

- We will prepare a dataframe that contains the subject, relation, and object extracted from these abstracts to plot the knowledge graph.


In [None]:
def prepare_df(text_list):
    """Build a subject/relation/object table from a block of text.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. For each sentence the entity pair and the relation are
    extracted, and the triple is kept only when all three parts are longer
    than two characters.

    Args:
        text_list: The text to analyse, as a single string.

    Returns:
        A DataFrame with the columns ``subject``, ``relation`` and
        ``object``. The columns are present even when no triple qualifies.
    """
    doc = nlp(text_list)

    # Collect rows in a list and build the frame once. Appending to a
    # DataFrame inside the loop copies the frame on every iteration, and
    # DataFrame.append no longer exists in pandas 2.
    rows = []
    for sent in doc.sents:
        sentence_text = str(sent)
        sub, obj = get_entities(sentence_text)
        relation = get_relation(sentence_text)

        if len(relation) > 2 and len(sub) > 2 and len(obj) > 2:
            rows.append({'subject': sub, 'relation': relation, 'object': obj})

    return pd.DataFrame(rows, columns=['subject', 'relation', 'object'])

In [None]:
# NOTE(review): text_list is a string, so [24:] drops its first 24 characters
# before parsing — confirm this offset is intended.
df = prepare_df(text_list[24:])
df.head()

In [None]:

def draw_kg(pairs,c1='red',c2='blue',c3='orange'):
    """Plot a directed knowledge graph from subject/relation/object rows.

    Args:
        pairs: DataFrame with 'subject', 'object' and 'relation' columns.
        c1: Edge colour.
        c2: Node border colour.
        c3: Node fill colour.
    """
    k_graph = nx.from_pandas_edgelist(
        pairs, 'subject', 'object', create_using=nx.MultiDiGraph())

    # Node size grows with the node's degree.
    node_sizes = [int(degree) * 500 for _node, degree in nx.degree(k_graph)]
    layout = nx.spring_layout(k_graph, k=0.15, iterations=20)

    # One label per (subject, object) pair; a later row replaces an earlier
    # row with the same pair.
    edge_labels = {}
    for subj, obj, rel in zip(pairs.subject, pairs.object,
                              pairs['relation'].tolist()):
        edge_labels[(subj, obj)] = rel

    plt.figure(num=None, figsize=(50, 40), dpi=80)
    nx.draw_networkx(
        k_graph,
        pos=layout,
        node_size=node_sizes,
        node_color=c3,
        edgecolors=c2,
        edge_color=c1,
        linewidths=1.5,
        arrowsize=20,
    )
    nx.draw_networkx_edge_labels(
        k_graph, pos=layout, edge_labels=edge_labels, font_color='red')
    plt.axis('off')
    plt.show()

In [None]:
# Plot the graph with the default colours.
draw_kg(df)

### <font size='3' color='red'>What is known about transmission, incubation, and environmental stability? </font>

- Get similar articles.

In [None]:
# Query with the first task statement and keep the 15 closest entries.
sentence= "What is known about transmission, incubation, and environmental stability"
similar=get_top_similar(sentence,sentence_list,similarity_matrix,15)


- Prepare the abstract and title

In [None]:
# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

In [None]:
# Clean each abstract, then merge them into one space-separated string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

- Find the subject,object and relations and plot the KG.

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df)

### <font size='3' color='red'>What do we know about COVID-19 risk factors?</font>

In [None]:
# Query with the task statement and keep the 8 closest entries.
sentence = "What do we know about COVID-19 risk factors"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 8)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

### <font size='3' color='red'>What do we know about virus genetics, origin, and evolution?
</font>

In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "What do we know about virus genetics, origin, and evolution"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

### <font size='3' color='red'>What do we know about vaccines and therapeutics?</font>

In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "What do we know about vaccines and therapeutics"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

### <font size='3' color='red'>What is the Role of the environment in transmission</font>

In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "Role of the environment in transmission"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What do we know about non-pharmaceutical interventions?</font>


In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "What do we know about non-pharmaceutical interventions"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

### <font size='3' color='red'>What has been published about ethical and social science considerations?</font>


In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "What has been published about ethical and social science considerations"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

### <font size='3' color='red'>What do we know about diagnostics and surveillance?</font>


In [None]:
# Query with the task statement and keep the 15 closest entries.
sentence = "What do we know about diagnostics and surveillance"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What is known about the Range of incubation periods for the disease in humans?</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Range of incubation periods for the disease in humans"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What is known about Role of the environment in transmission?</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Role of the environment in transmission"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What is known about Seasonality of transmission</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Seasonality of transmission"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

* <font size='3' color='red'>What has been published about Prevalence of asymptomatic shedding and transmission</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Prevalence of asymptomatic shedding and transmission"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What is known about the Susceptibility of populations?</font>

In [None]:


# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Susceptibility of populations"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

In [None]:
# Extract the triples from the merged abstracts and plot them.
df = prepare_df(text_list)
draw_kg(df,c1='blue',c2='pink',c3='green')

<font size='3' color='red'>What is known about Public health mitigation measures that could be effective for control</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Public health mitigation measures that could be effective for control"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

<font size='3' color='red'>What is known about the Transmission dynamics of the virus</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Transmission dynamics of the virus"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

### <font size='3' color='red'>Do we have any Evidence that livestock could be infected?</font>


In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Evidence that livestock could be infected"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

### <font size='3' color='red'>What is known about Socioeconomic and behavioral risk factors for this spill-over</font>


In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Socioeconomic and behavioral risk factors for this spill-over"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

### <font size='3' color='red'>What are the Sustainable risk reduction strategies?</font>


In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Sustainable risk reduction strategies"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

## <font size='3' color='red'>Resources to support skilled nursing facilities and long term care facilities</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Resources to support skilled nursing facilities and long term care facilities"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

### <font size='3' color='red'>Mobilization of surge medical staff to address shortages in overwhelmed communities</font>

In [None]:
# Query with the sub-task statement and keep the 15 closest entries.
sentence = "Mobilization of surge medical staff to address shortages in overwhelmed communities"
similar = get_top_similar(sentence, sentence_list, similarity_matrix, 15)

# Split the (position, title) pairs into two parallel lists.
ind, title = (list(column) for column in zip(*similar))

# Collect the title and abstract stored at each matched row position.
titles, texts = [], []
for i in ind:
    row = all_articles.iloc[i]
    titles.append(row['title'])
    texts.append(row['abstract'])

# Clean each abstract, then merge them into one string.
texts = [clean(abstract) for abstract in texts]
text_list = ' '.join(texts)

# Extract the triples and plot them.
df = prepare_df(text_list)
draw_kg(df, c1='blue', c2='pink', c3='green')

## <font color='blue' size='4'>Please Leave an upvote if you like it ⬆️ <font size='3' color='red'>Thank you :)</font></font>

### References
- https://networkx.github.io/documentation/stable/
- https://www.analyticsvidhya.com/blog/2019/10/how-to-build-knowledge-graph-text-using-spacy/