 # Project description

In response to the COVID-19 pandemic, the White House and a coalition of leading research groups have prepared the COVID-19 Open Research Dataset (CORD-19). CORD-19 is a resource of over 200,000 scholarly articles, including over 100,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses. This freely available dataset is provided to the global research community to apply recent advances in natural language processing and other AI techniques to generate new insights in support of the ongoing fight against this infectious disease. There is a growing urgency for these approaches because of the rapid acceleration in new coronavirus literature, making it difficult for the medical research community to keep up.

## Importing and Installing Libraries

In [None]:

#File processing
import zipfile
import numpy as np
import pandas as pd
import glob
import json



In [None]:
!pip install scispacy

In [None]:
import scispacy

In [None]:
!pip install en_core_sci_md

In [None]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz

In [None]:
import en_core_sci_md

In [None]:
!python -m spacy download en

## Loading and transforming the files into a data frame

Reference for some of the code in this section: [COVID EDA Initial Exploration Tool](https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool)

### Loading Metadata


In [None]:
!ls /kaggle/input/CORD-19-research-challenge

### Reading metadata

In [None]:
root_path = '/kaggle/input/CORD-19-research-challenge'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

In [None]:
meta_df.shape

### Creating dataframe

In [None]:
corona_features = {'paper_id': [], 'title': [],
                   'abstract': [], 'text': []}

### Loading Json Data

In [None]:
root_path='../input/CORD-19-research-challenge/document_parses/pdf_json'

In [None]:
corona_df = pd.DataFrame.from_dict(corona_features)

In [None]:
json_filenames = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(json_filenames)

In [None]:
print(json_filenames)

In [None]:
len(json_filenames)

In [None]:
def return_corona_df(json_filenames, df):
  """Parse CORD-19 JSON files and add one row per paper to ``df``.

  Args:
    json_filenames: Iterable of paths to JSON parse files. Each file must
      contain ``paper_id`` and ``metadata.title``; ``abstract`` and
      ``body_text`` are lists of ``{'text': ...}`` entries.
    df: DataFrame the parsed rows are added to. It is not modified.

  Returns:
    A DataFrame holding the rows of ``df`` followed by one row per file,
    with the columns ``paper_id``, ``title``, ``abstract`` and ``text``.
    When ``json_filenames`` is empty, ``df`` itself is returned.
  """
  rows = []
  for file_name in json_filenames:
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_name, encoding='utf-8') as json_data:
      data = json.load(json_data)

    # A missing 'abstract' / 'body_text' key is treated as an empty section
    # rather than aborting the whole load with a KeyError.
    abstract = '\n '.join(part['text'] for part in data.get('abstract', []))
    text = '\n '.join(part['text'] for part in data.get('body_text', []))

    rows.append({
        'paper_id': data['paper_id'].strip(),  # ' 345 ' -> '345'
        'title': data['metadata']['title'].strip(),
        'abstract': abstract.strip(),
        'text': text.strip(),
    })

  if not rows:
    return df

  # Build all new rows at once and combine a single time: appending row by
  # row copies the whole frame for every file, and DataFrame.append is not
  # available in pandas 2.x.
  new_rows = pd.DataFrame(rows)
  if len(df) == 0:
    # Nothing to concatenate with; keep the column order of ``df`` first.
    columns = list(df.columns) + [c for c in new_rows.columns
                                  if c not in df.columns]
    return new_rows.reindex(columns=columns)
  return pd.concat([df, new_rows], ignore_index=True)

In [None]:
corona_df = return_corona_df(json_filenames, corona_df)

In [None]:
corona_df.shape

In [None]:
corona_df.head()

## Text Analysis

In [None]:

#Visualization
import seaborn as sns
from matplotlib import pyplot as plt

# Text Analysis
import nltk
from IPython.core.display import HTML
import spacy

#### Getting a text sample

In [None]:
corona_df['text'][0]

### Cleaning the file

#### Checking for null values

In [None]:
corona_df.isnull().sum()

In [None]:
len(corona_df[corona_df['paper_id'] == ''])

In [None]:
len(corona_df[corona_df['title'] == ''])

In [None]:
len(corona_df[corona_df['abstract'] == ''])

In [None]:
len(corona_df[corona_df['text'] == ''])

In [None]:
corona_df = corona_df[corona_df['title'] != '']

In [None]:
corona_df = corona_df[corona_df['abstract'] != '']

In [None]:
corona_df.shape

#### Checking for duplicates

In [None]:
corona_df.drop_duplicates(['abstract', 'text', 'title'], inplace = True)

In [None]:
corona_df.shape

In [None]:
sample_text = corona_df['text'][5000]
sample_text

#### Taking a sample to analyse quickly

In [None]:
#corona_df = corona_df.sample(n = 500, random_state=1)

In [None]:
#corona_df.shape

### NLP Pre Processing the information

In [None]:
nlp = en_core_sci_md.load(disable=['tagger', 'parser', 'ner'])
nlp.max_length = 2000000

In [None]:
print(spacy.lang.en.stop_words.STOP_WORDS)

In [None]:
len(spacy.lang.en.stop_words.STOP_WORDS)

In [None]:
# Extra terms to treat as stop words, flagged on the vocabulary of the
# loaded `nlp` pipeline so that `is_stop` is true for them.
new_stop_words = ['et', 'al', 'doi', 'copyright', 'http', 'https', 'fig', 'table', 'result', 'show']
for word in new_stop_words:
  lexeme = nlp.vocab[word]
  lexeme.is_stop = True

#### Extracting word lemmas with lemmatization

In [None]:
def spacy_tokenizer(sentence):
  """Lower-case ``sentence`` and return its kept lemmas joined by spaces.

  The text is run through the module-level ``nlp`` pipeline. A token is
  dropped when it is a stop word, looks like a number, is punctuation or
  whitespace, or is exactly one character long.
  """
  lemmas = []
  for token in nlp(sentence.lower()):
    if token.is_stop or token.like_num or token.is_punct or token.is_space:
      continue
    if len(token) == 1:
      continue
    lemmas.append(str(token.lemma_))
  return ' '.join(lemmas)

In [None]:
sample_text = corona_df['text'][5000]
sample_text

#### Removing numbers

In [None]:
test = '1 ' + sample_text
test

#### Removing punctuation

In [None]:
result = spacy_tokenizer(test)
result

In [None]:
corona_df['text'] = corona_df['text'].apply(spacy_tokenizer)

In [None]:
len(sample_text)

In [None]:
print(corona_df['text'][5000])

In [None]:
len(corona_df['text'][5000])

### Treatment of Frequent Terms

In [None]:
import os

# The files are written into 'corpus/', which nothing else creates; make
# sure it exists so the first open() does not fail.
os.makedirs('corpus', exist_ok=True)
for index, row in corona_df.iterrows():
  print(row['paper_id'], row['title'])
  # One text file per paper, named after its id. The context manager closes
  # the file even if the write fails; UTF-8 is set explicitly so non-ASCII
  # text is written the same way on every platform.
  with open('corpus/' + row['paper_id'] + '.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(row['text'])

In [None]:
from nltk.corpus import PlaintextCorpusReader
corpus = PlaintextCorpusReader('corpus', '.*')

In [None]:
files = corpus.fileids()

In [None]:
files[0]

In [None]:
corpus.raw('00b88130d2a7c8489e209742494303b6731d7544.txt')

In [None]:
words = corpus.words()
print(words)

In [None]:
len(words)

In [None]:
frequency = nltk.FreqDist(words)
most_common = frequency.most_common(100)
most_common

### Visualizing with wordcloud

In [None]:
from matplotlib.colors import ListedColormap
color_map = ListedColormap(['purple', 'goldenrod', 'red', 'royalblue'])

In [None]:
from wordcloud import WordCloud
# 'dark' is not a colour name the imaging backend can parse, so rendering
# would fail; use an explicit dark background colour instead.
cloud = WordCloud(background_color = 'black', max_words=100, colormap=color_map)

In [None]:
cloud = cloud.generate(corona_df['text'].str.cat(sep='\n'))
plt.figure(figsize=(15,15))
plt.imshow(cloud)
plt.axis('off')
plt.show()

In [None]:
corona_df.to_csv('corona_df.csv')

### Extraction of entities

#### Testing the function

In [None]:
#Selecting specific text
text = str(corona_df['text'][10644])
print(text)

In [None]:
nlp_ent = spacy.load('en')
nlp_ent.max_length = 2000000

In [None]:
doc = nlp_ent(text)

In [None]:
for entity in doc.ents:
  if entity.label_ == 'NORP' or entity.label_ == 'GPE':
    print(entity.text, entity.label_)

In [None]:
from spacy import displacy
displacy.render(doc, style = 'ent', jupyter = True)

#### Counting entities

In [None]:
# Which are the most cited countries in this dataset?
# Collect the text of every entity labelled GPE across all papers.
# NOTE(review): corona_df['text'] was replaced by lower-cased lemmas in an
# earlier cell; confirm the entity model still behaves well on such input.
gpe = []
for index, row in corona_df.iterrows():
  text = row['text']
  doc = nlp_ent(text)
  gpe.extend(str(entity.text) for entity in doc.ents
             if entity.label_ == 'GPE')

In [None]:
print(gpe)

In [None]:
values_gpe, counts_gpe = np.unique(np.array(gpe), return_counts = True)

In [None]:
gpe_df = pd.DataFrame({'value': values_gpe, 'counts': counts_gpe})

In [None]:
gpe_df.head()

In [None]:
gpe_df.shape

In [None]:
gpe_df_filtered = gpe_df[gpe_df.counts > 50]

In [None]:
gpe_df_filtered.shape

In [None]:
gpe_df_filtered.head(16)

In [None]:
sns.set(rc={'figure.figsize': (15,8)})
sns.barplot(x = 'value', y = 'counts', hue='value', data=gpe_df_filtered);

## Text for Search

### Searching with NLTK

In [None]:
text = nltk.Text(corpus.words())

Checking if any results were found for the word 'pulmonary':

In [None]:
match = text.concordance('pulmonary', width = 150, lines=30)

Searching with NLTK has a lot of restrictions. The code above will only find the exact result, ignoring similar words. This is a very simple way of solving the challenge. 

### Searching with 'find'

#### 'Find' method

In [None]:
string = 'spread wuhan city china infect traveller cause sporadic secondary transmission city secondary city epidemic'
search_string = 'city'

In [None]:
print(string.find(search_string))

In [None]:
string[13]

In [None]:
string[13:13+10]

In [None]:
string[13-10:13]

In [None]:
string[13:13+10000]

Code based on: https://www.journaldev.com/23666/python-string-find

In [None]:
def find_texts(input_str, search_str, number_of_words):
  """Return a context snippet for every occurrence of ``search_str``.

  Args:
    input_str: Text to search.
    search_str: Substring to look for (exact match via ``str.find``).
    number_of_words: Size of the context window on each side of the start
      of a match. Despite the name it is counted in characters, not words.

  Returns:
    A list with one string per occurrence (overlapping occurrences
    included). Each snippet runs from ``number_of_words`` characters before
    the match start to ``number_of_words`` characters after it, clipped to
    the bounds of ``input_str``.
  """
  snippets = []
  index = 0
  while index < len(input_str):
    i = input_str.find(search_str, index)
    if i == -1:
      break

    # Clamp at 0: a negative slice start would count from the end of the
    # string and cut off part of the left context for matches near the start.
    start = max(0, i - number_of_words)
    snippets.append(input_str[start:i + number_of_words])
    index = i + 1  # step by one so overlapping matches are found too
  return snippets

In [None]:
texts = find_texts(string, search_string, 50)
texts

Creating a visualization with html:

In [None]:
display(HTML(f'<h1>{search_string.upper()}</h1>'))
display(HTML(f"""<p><strong>Number of matches:</strong> {len(texts)}</p>"""))
for i in texts:
  #print(i)
  marked_text = str(i.replace(search_string, f"<mark>{search_string}</mark>"))
  #print(marked_text)
  display(HTML(f"""<blockquote>... {marked_text} ...</blockquote>"""))

#### Implementing on the dataset

In [None]:
search_string = 'pulmonary disease'

In [None]:
search_string = spacy_tokenizer(search_string)
search_string

In [None]:
def find_all_texts(input_str, search_str, number_of_words):
  """Return a context snippet for every occurrence of ``search_str``.

  Args:
    input_str: Text to search.
    search_str: Substring to look for (exact match via ``str.find``).
    number_of_words: Size of the context window on each side of the start
      of a match. Despite the name it is counted in characters, not words.

  Returns:
    A list with one string per occurrence (overlapping occurrences
    included). Each snippet runs from ``number_of_words`` characters before
    the match start to ``number_of_words`` characters after it, clipped to
    the bounds of ``input_str``.
  """
  text_list = []
  index = 0
  while index < len(input_str):
    i = input_str.find(search_str, index)
    if i == -1:
      break

    # Clamp at 0: a negative slice start would count from the end of the
    # string and cut off part of the left context for matches near the start.
    start = max(0, i - number_of_words)
    text_list.append(input_str[start:i + number_of_words])
    index = i + 1  # step by one so overlapping matches are found too
  return text_list

In [None]:
documents = []
for index, row in corona_df.iterrows():
  documents.append(find_all_texts(row['text'], search_string, 40))

In [None]:
len(documents)

In [None]:
for doc in documents:
  if doc != []:
    print(doc)

In [None]:
for index, row in corona_df.iterrows():
  texts = find_all_texts(row['text'], search_string, 400)
  if texts == []:
    continue
  
  paper_id = row['paper_id']
  title = row['title']
  display(HTML(f'<h1>{search_string.upper()}</h1>'))
  display(HTML(f"""<p>
                      <strong>Title:</strong> {title}</br>
                      <strong>ID:</strong> {paper_id}</br>
                      <strong>Number of matches:</strong> {len(texts)}
                   </p>"""))
  for i in texts:
    marked_text = str(i.replace(search_string, f"<mark>{search_string}</mark>"))
    display(HTML(f"""<blockquote>... {marked_text} ...</blockquote>"""))  

**Advantages**

- Fast
- Easy to implement
- Very useful for simple single-word searches
- Good for keywords

**Disadvantages**

- Only one keyword at a time
- No ranking of the results by importance
- Only considers the "complete" (exact) word

### Searching with Spacy

#### Testing spacy

In [None]:
string = 'spread wuhan city china infect traveller cause sporadic secondary transmission city secondary city epidemic'

In [None]:
search_strings = ['city', 'traveller']
tokens_list = [nlp(item) for item in search_strings]

In [None]:
tokens_list

In [None]:
print(nlp.vocab)

In [None]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add('SEARCH', None, *tokens_list)

In [None]:
doc = nlp(string)
matches = matcher(doc)
matches

In [None]:
doc[2:3]

In [None]:
doc[10-5:10+5]

In [None]:
matches[0]

In [None]:
matches[0][1]

In [None]:
matches[0][2]

In [None]:
doc[matches[0][1]:matches[0][2]]

#### Implementing spacy on the dataset

In [None]:
search_strings = ['smoking', 'pulmonary disease']
tokens_list = [nlp(spacy_tokenizer(item)) for item in search_strings]
tokens_list

In [None]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add('SEARCH', None, *tokens_list)
number_of_words = 50

In [None]:
search_strings_html = ' '.join([str(element) for element in search_strings])
search_strings_html

In [None]:
# Token texts of every search pattern, computed once for all papers.
pattern_tokens = [[token.text for token in pattern] for pattern in tokens_list]

for index, row in corona_df.iterrows():
  marked_text = ''
  doc = nlp(row['text'])
  paper_id = row['paper_id']
  title = row['title']
  matches = matcher(doc)
  if not matches:
    continue

  display(HTML(f'<h1>{search_strings_html.upper()}</h1>'))
  display(HTML(f"""<p>
                      <strong>Title:</strong> {title}</br>
                      <strong>ID:</strong> {paper_id}</br>
                      <strong>Number of matches:</strong> {len(matches)}
                   </p>"""))
  # Each match is a (match_id, start_token, end_token) tuple.
  for i in matches:
    start = max(i[1] - number_of_words, 0)
    matched_tokens = [token.text for token in doc[i[1]:i[2]]]
    for pattern, tokens in zip(tokens_list, pattern_tokens):
      # Identify which pattern produced the match by comparing token texts.
      # Testing vector similarity for exact equality with 1.0 is fragile:
      # it depends on floating-point rounding and on the tokens having vectors.
      if matched_tokens == tokens:
        search_text = str(pattern)
        marked_text += str(doc[start:i[2] + number_of_words]).replace(search_text, f"<mark>{search_text}</mark>")
        marked_text += "<br /><br />"
  display(HTML(f"""<blockquote>... {marked_text} ...</blockquote>"""))

**Advantages**
- Easy to implement
- Very useful for simple searches with more than one word
- Good for keywords
- Uses a library built specifically for natural language processing

**Disadvantages**
- No ranking of the results by importance

### Fuzzywuzzy for strings similarity
Levenshtein distance: https://pt.wikipedia.org/wiki/Dist%C3%A2ncia_Levenshtein

#### Tests with fuzzywuzzy

In [None]:
!pip install fuzzywuzzy

In [None]:
!pip install python-Levenshtein

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Similarity of the two strings with character order taken into account
fuzz.ratio('Apple Inc.', 'Apple')

In [None]:
fuzz.partial_ratio('Apple Inc.', 'Apple')

In [None]:
# Ignores the order of the words
fuzz.token_sort_ratio('Lakers x Chigaco Bulls', 'Chigago Bulls x Lakers')

In [None]:
# Ignores duplicated words
fuzz.token_set_ratio('Today we have a great game: Lakers x Chigago Bulls', 'Chicago Bulls x Lakers')

#### Comparison with the text

In [None]:
search_string = 'Guidance on ways to scale up NPIs in a more coordinated way (e.g., establish funding, infrastructure and authorities to support real time, authoritative (qualified participants) collaboration with all states to gain consensus on consistent guidance and to mobilize resources to geographic areas where critical shortfalls are identified) to give us time to enhance our health care delivery system capacity to respond to an increase in cases. Rapid design and execution of experiments to examine and compare NPIs currently being implemented. DHS Centers for Excellence could potentially be leveraged to conduct these experiments'

Applying lemmatizer: 

In [None]:
search_string = spacy_tokenizer(search_string)
print(search_string)

In [None]:
# Score the processed full text of every paper against the query with the
# four fuzzywuzzy metrics; each list holds one score per paper.
ratio, partial_ratio, sort_ratio, set_ratio = [], [], [], []
for index, row in corona_df.iterrows():
  body = row['text']
  ratio.append(fuzz.ratio(body, search_string))
  partial_ratio.append(fuzz.partial_ratio(body, search_string))
  sort_ratio.append(fuzz.token_sort_ratio(body, search_string))
  set_ratio.append(fuzz.token_set_ratio(body, search_string))

In [None]:
len(ratio)

In [None]:
np.array(ratio).mean()

In [None]:
np.array(partial_ratio).mean()

In [None]:
np.array(sort_ratio).mean()

In [None]:
np.array(set_ratio).mean()

#### Comparison with abstract

In [None]:
corona_df.head()

In [None]:
corona_df['abstract'] = corona_df['abstract'].apply(spacy_tokenizer)

In [None]:
corona_df.head()

In [None]:
# Score the processed abstract of every paper against the query with the
# four fuzzywuzzy metrics; each list holds one score per paper.
ratio, partial_ratio, sort_ratio, set_ratio = [], [], [], []
for index, row in corona_df.iterrows():
  summary = row['abstract']
  ratio.append(fuzz.ratio(summary, search_string))
  partial_ratio.append(fuzz.partial_ratio(summary, search_string))
  sort_ratio.append(fuzz.token_sort_ratio(summary, search_string))
  set_ratio.append(fuzz.token_set_ratio(summary, search_string))

In [None]:
np.array(ratio).mean()

In [None]:
np.array(partial_ratio).mean()

In [None]:
np.array(sort_ratio).mean()

In [None]:
np.array(set_ratio).mean()

#### Getting the most similar papers

In [None]:
scores = {}
for index, row in corona_df.iterrows():
  scores[row['paper_id']] = fuzz.token_set_ratio(row['text'], search_string)

In [None]:
print(scores)

In [None]:
scores['532f2c636fca1caae1f23885b9dc0e3302a0afd5']

In [None]:
import operator
sorted_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse = True)

In [None]:
sorted_scores[0:10]

In [None]:
corona_df.loc[corona_df['paper_id'] == '68a7101a90454172c91785d8c352f776a82df5d4']

In [None]:
# None disables truncation of long cell values; the old -1 sentinel is
# deprecated and rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)
display(HTML(f'<h4>{search_string.upper()}</h4>'))
for i in sorted_scores[:10]:
  # i is a (paper_id, score) pair. Take the first matching row so that the
  # title and abstract are rendered as plain values rather than as the repr
  # of a one-element Series (index label, name and dtype included).
  paper = corona_df.loc[corona_df['paper_id'] == i[0]].iloc[0]
  display(HTML(f"""<p>
                      <strong>Title:</strong> {paper['title']}</br>
                      <strong>ID:</strong> {i[0]}</br>
                      <strong>Score:</strong> {i[1]}</br>
                      <strong>Abstract:</strong> {str(paper['abstract'])[0:700]}
                   </p>"""))

**Advantages**

- Library with several scoring functions
- Useful both for keyword searches and for longer texts
- Scores the results according to their importance

**Disadvantages**

- The underlying mathematical model is not very well suited to this task

### Similar papers with TF-IDF and cosine similarity

#### TF-IDF

TF-IDF (term frequency - inverse document frequency)
rescales the frequency of each word by how often it appears across all documents.

Term frequency (TF): how frequent the word is in the current document - TF = (number of times term t appears in the document) / (number of terms in the document)

Inverse document frequency (IDF): how rare the word is across the documents - IDF = log10(N/n), where N is the number of documents and n is the number of documents in which term t appears

TF-IDF: the importance of a word to a document in a collection or corpus - TF-IDF = TF * IDF

Consider a document with 100 words in which the word "dog" appears 5 times

TF = 5 / 100 = 0.05

There are 100 documents in total (N) and the word "dog" appears in 20 of them (n)

IDF = log10(100 / 20) ≈ 0.70

TF-IDF = 0.05 * 0.70 ≈ 0.035

The higher the weight, the rarer the term. The lower the weight, the more common the term. (scikit-learn's `TfidfVectorizer` uses a smoothed natural-logarithm variant of IDF and normalizes each document vector, so its values differ from this hand calculation.)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
texts = corona_df['text'][:3].tolist()
texts

In [None]:
texts.append(texts[0])
texts

In [None]:
tfidf = TfidfVectorizer()
vectorized = tfidf.fit_transform(texts)

In [None]:
type(vectorized)

In [None]:
print(tfidf.get_feature_names())

In [None]:
len(tfidf.get_feature_names())

In [None]:
print(tfidf.vocabulary_)

In [None]:
print(tfidf.idf_)

In [None]:
vectorized

In [None]:
vectorized.todense()

In [None]:
vectorized.todense().shape

#### Cos similarity

Link: https://en.wikipedia.org/wiki/Cosine_similarity
Step-by-step calculation: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
We have two matrices: TF and IDF
Cosine similarity(d1, d2) = dot product(d1, d2) / (||d1|| * ||d2||)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
vectorized[0].todense()

In [None]:
cosine_similarity(vectorized[0], vectorized[1])

In [None]:
cosine_similarity(vectorized[0], vectorized[3])

In [None]:
similarity = cosine_similarity(vectorized[0], vectorized)
similarity

#### TF-IDF and cosine similarity in the dataset

In [None]:
texts = corona_df['text'].tolist()

In [None]:
texts[:10]

In [None]:
len(texts)

In [None]:
tfidf = TfidfVectorizer()
vectorized = tfidf.fit_transform(texts)

In [None]:
vectorized

In [None]:
search_string = 'Guidance on ways to scale up NPIs in a more coordinated way (e.g., establish funding, infrastructure and authorities to support real time, authoritative (qualified participants) collaboration with all states to gain consensus on consistent guidance and to mobilize resources to geographic areas where critical shortfalls are identified) to give us time to enhance our health care delivery system capacity to respond to an increase in cases. Rapid design and execution of experiments to examine and compare NPIs currently being implemented. DHS Centers for Excellence could potentially be leveraged to conduct these experiments'

In [None]:
search_string = spacy_tokenizer(search_string)
print(search_string)

In [None]:
search_string_vectorized = tfidf.transform([search_string])

In [None]:
search_string_vectorized

In [None]:
similarity = cosine_similarity(search_string_vectorized, vectorized)
similarity

In [None]:
len(similarity[0])

In [None]:
# Map each row position of the similarity vector to its score.
scores_dict = dict(enumerate(similarity[0]))

In [None]:
print(scores_dict)

In [None]:
import operator
sorted_scores = sorted(scores_dict.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
print(sorted_scores)

In [None]:
display(HTML(f'<h4>{search_string.upper()}</h4>'))
for i in sorted_scores[:10]:
  df = corona_df.iloc[i[0]]

  display(HTML(f"""<p>
                      <strong>Title:</strong> {df['title']}</br>
                      <strong>ID:</strong> {df['paper_id']}</br>
                      <strong>Score:</strong> {i[1]}</br>
                      <strong>Abstract:</strong> {str(df['abstract'][0:700])}
                   </p></br>"""))

### Grouping the Papers

#### Treating the database

In [None]:
corona_df_completo = pd.read_csv('/content/gdrive/My Drive/corona_df_completo.csv')

In [None]:
corona_df_completo.shape

In [None]:
corona_df_completo = corona_df_completo.dropna()

In [None]:
corona_df_completo.shape

In [None]:
pd.set_option('display.max_colwidth', 100)
corona_df_completo.head()

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
dataset_texts = corona_df_completo['text'].tolist()

In [None]:
len(dataset_texts)

In [None]:
tfidf = TfidfVectorizer(max_features=2**12)
vectorized = tfidf.fit_transform(dataset_texts)
vectorized

#### Reduction of dimensionality

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X_pca = pca.fit_transform(vectorized.toarray())

In [None]:
X_pca.shape

This resulted in a two-column matrix covering all of the roughly 29k articles.

In [None]:
components = pca.explained_variance_ratio_
components

In [None]:
sns.set(rc={'figure.figsize': (10,8)})
# Pass the coordinates by keyword: current seaborn releases no longer accept
# x and y as positional arguments.
sns.scatterplot(x=X_pca[:,0], y=X_pca[:, 1])
plt.title('Covid-19 Papers');

#### Defining number of clusters

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
wcss = []
for i in range(1, 21):
  kmeans = MiniBatchKMeans(n_clusters = i, random_state = 0)
  kmeans.fit(vectorized)
  wcss.append(kmeans.inertia_)
plt.plot(range(1, 21), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

#### Grouping with k-means

In [None]:
k = 5
# Fix the seed, as the elbow search does, so the cluster assignment is
# reproducible between runs.
kmeans = MiniBatchKMeans(n_clusters=k, random_state=0)
y_pred = kmeans.fit_predict(vectorized)

In [None]:
np.unique(y_pred)

In [None]:
len(y_pred)

In [None]:
# One colour per distinct cluster label.
palette = sns.color_palette('bright', len(set(y_pred)))
# Pass the coordinates by keyword: current seaborn releases no longer accept
# x and y as positional arguments.
sns.scatterplot(x=X_pca[:,0], y=X_pca[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title('Clustered Covid-19 Papers');

#### Visualizing results

In [None]:
# Based on: https://www.kaggle.com/maksimeren/covid-19-literature-clustering

In [None]:
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap
from bokeh.io import output_file, show
from bokeh.transform import transform
from bokeh.io import output_notebook
from bokeh.plotting import figure
from bokeh.layouts import column
from bokeh.models import RadioButtonGroup
from bokeh.models import TextInput
from bokeh.layouts import gridplot
from bokeh.models import Div
from bokeh.models import Paragraph
from bokeh.layouts import column, widgetbox

In [None]:
output_notebook()
y_labels = y_pred

# data sources
source = ColumnDataSource(data=dict(
    x= X_pca[:,0], 
    y= X_pca[:,1],
    x_backup = X_pca[:,0],
    y_backup = X_pca[:,1],
    desc= y_labels, 
    titles= corona_df_completo['title'],
    abstract = corona_df_completo['abstract'],
    labels = ["C-" + str(x) for x in y_labels]
    ))


In [None]:
# hover over information
hover = HoverTool(tooltips=[
    ("Title", "@titles{safe}"),
    ("Abstract", "@abstract{safe}"),
],
                 point_policy="follow_mouse")

# map colors
mapper = linear_cmap(field_name='desc', 
                     palette=Category20[20],
                     low=min(y_labels) ,high=max(y_labels))

# prepare the figure
p = figure(plot_width=800, plot_height=800, 
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'], 
           title="Covid-19 Papers", 
           toolbar_location="right")

# plot
p.scatter('x', 'y', size=5, 
          source=source,
          fill_color=mapper,
          line_alpha=0.3,
          line_color="black",
          legend = 'labels')

# Number of cluster buttons to offer. Assumes the labels in y_labels are
# integer cluster indices starting at 0, so that the position of a button
# equals the cluster id it selects.
n_clusters = int(max(y_labels)) + 1

# add callback to control
# cb_obj is the radio button group: `active` is the index of the pressed
# button and the last button ("All") restores every point.
callback = CustomJS(args=dict(p=p, source=source), code="""
            
            var radio_value = cb_obj.active;
            var all_index = cb_obj.labels.length - 1;
            var data = source.data; 
            
            var x = data['x'];
            var y = data['y'];
            
            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];
            
            var labels = data['desc'];
            
            if (radio_value == all_index) {
                for (var i = 0; i < x.length; i++) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                }
            }
            else {
                for (var i = 0; i < x.length; i++) {
                    if(labels[i] == radio_value) {
                        x[i] = x_backup[i];
                        y[i] = y_backup[i];
                    } else {
                        x[i] = undefined;
                        y[i] = undefined;
                    }
                }
            }


        source.change.emit();
        """)

# callback for searchbar
# Keeps only the points whose abstract or title contains the typed text.
keyword_callback = CustomJS(args=dict(p=p, source=source), code="""
            
            var text_value = cb_obj.value;
            var data = source.data; 
            
            var x = data['x'];
            var y = data['y'];
            
            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];
            
            var abstract = data['abstract'];
            var titles = data['titles'];
            
            for (var i = 0; i < x.length; i++) {
                if(abstract[i].includes(text_value) || 
                   titles[i].includes(text_value)) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                } else {
                    x[i] = undefined;
                    y[i] = undefined;
                }
            }
        source.change.emit();
        """)

# option
# One button per cluster actually present, plus a trailing "All" button
# that is selected initially.
option = RadioButtonGroup(labels=["C-" + str(c) for c in range(n_clusters)] + ["All"],
                          active=n_clusters, callback=callback)

# search box
keyword = TextInput(title="Search:", callback=keyword_callback)

#header
header = Div(text="""<h1>Covid-19 Papers</h1>""")

# show
show(column(header, widgetbox(option, keyword),p))

### Summarizing the papers

#### Test with the function

In [None]:
corona_df_original = pd.read_csv('/content/gdrive/My Drive/corona_df_original.csv')

In [None]:
corona_df_original.shape

In [None]:
corona_df_original.head()

In [None]:
!pip install bert-extractive-summarizer

In [None]:
from summarizer import Summarizer

In [None]:
text = str(corona_df_original['text'][0])
print(text)

In [None]:
len(text)

In [None]:
model = Summarizer()

In [None]:
result = model(text)

In [None]:
result

In [None]:
result = ''.join(result)
print(result)

In [None]:
len(result)