### To gain quick data insights, we constructed a Keywords-Article Matrix as an overview approach to get a profile and a preliminary evaluation of the dataset.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords
from spacy.matcher import Matcher 
from collections import  Counter
import matplotlib.pyplot as plt
from spacy.tokens import Span 
import tensorflow_hub as hub
#from rake_nltk import Rake
import tensorflow as tf
import pyLDAvis.gensim
from tqdm import tqdm
import seaborn as sns
import networkx as nx
import pandas as pd
import numpy as np
import pyLDAvis
import gensim
import spacy
import os
import gc
import re

from scipy.spatial import distance
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import nltk

import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import similarities



!pip install -U sentence-transformers

# Library from here: https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer
#pd.describe_option('display')
# Notebook display settings: show long sequences, every column/row, and full cell text.
pd.options.display.max_seq_items = 2500
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# None means "never truncate cell text". The legacy -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Cleaned dataset from this kernel: https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv
!ls /kaggle/input/cord-19-eda-parse-json-and-generate-clean-csv/

Load DataFrame of Cleaned Documents

In [None]:
# Directory holding the pre-cleaned CORD-19 CSV exports.
CLEAN_DATA_PATH = "../input/cord-19-eda-parse-json-and-generate-clean-csv/"

# Read the four source collections in order, one DataFrame per CSV file.
pmc_df, biorxiv_df, comm_use_df, noncomm_use_df = (
    pd.read_csv(CLEAN_DATA_PATH + csv_name)
    for csv_name in (
        "clean_pmc.csv",
        "biorxiv_clean.csv",
        "clean_comm_use.csv",
        "clean_noncomm_use.csv",
    )
)


In [None]:

# Stack the four collections row-wise into one frame with a fresh 0..n-1 index.
papers_df_el = pd.concat(
    [pmc_df, biorxiv_df, comm_use_df, noncomm_use_df],
    ignore_index=True,
)

In [None]:
# Title and abstract joined by a single space, element-wise per paper.
papers_df_el['combined'] = papers_df_el['title'].add(' ').add(papers_df_el['abstract'])

In [None]:
# Peek at the first row of the merged frame.
papers_df_el.head(1)

In [None]:
# Discard, in place, every row with a missing value in any column.
papers_df_el.dropna(axis=0, how='any', inplace=True)
# keep=False flags *every* row whose title occurs more than once, so all copies are removed.
papers_df_el = papers_df_el.loc[~papers_df_el.duplicated(subset=['title'], keep=False)]
# Lower-cased title+abstract strings as a plain Python list.
papers_combined_el = list(papers_df_el['combined'].str.lower())

In [None]:
# Download the NLTK stop-word corpus used below.
nltk.download('stopwords')

# NLTK English stop words, extended with filler words and stray punctuation tokens.
stop_words = stopwords.words('english') + [
    'also', 'may', 'however', 'could', "''", '=', '.', '(', ')',
    'abstract', 'found', 'using', 'used', 'result', 'including', 'based',
    'although', 'among', 'two', 'three', 'one', 'or', 'use',
]

def common_words_graph(df, col, n):
    """Plot a horizontal bar chart of the ``n`` most frequent words in ``df[col]``.

    Each non-null cell is split on whitespace. Tokens are lower-cased and
    lemmatized with WordNet, and entries of the module-level ``stop_words``
    are dropped both before and after lemmatization.

    Args:
        df: DataFrame holding the text column.
        col: Name of the text column to analyse.
        n: Number of top words to show.
    """
    # The stop list is consulted twice per token, so use a hash-based lookup
    # instead of scanning the list each time.
    stop_set = set(stop_words)
    lem = WordNetLemmatizer()

    lemmas = (
        lem.lemmatize(token.lower())
        for tokens in df[col].dropna().str.split()
        for token in tokens
        if token not in stop_set
    )
    # Second filter: lower-casing/lemmatizing can turn a token into a stop word
    # (the first check is case-sensitive, so e.g. "The" gets past it).
    counter = Counter(word for word in lemmas if word not in stop_set).most_common(n)

    top_words = [word for word, _ in counter]
    top_counts = [count for _, count in counter]

    plt.figure(figsize=(9, 7))
    sns.barplot(x=top_counts, y=top_words)
    plt.title('Top '+str(n)+' words in '+col)
    plt.show()


# List the top 20 words occurring in the papers' titles

In [None]:
# Bar chart of the 20 most frequent words in the 'title' column.
common_words_graph(papers_df_el, 'title',20)

# List the top 20 words occurring in the papers' abstracts

In [None]:
# Bar chart of the 20 most frequent words in the 'abstract' column.
common_words_graph(papers_df_el, 'abstract',20)

In [None]:
def clean_word_round1(text):
    """Normalise one document for keyword counting.

    Lower-cases ``text``, deletes the characters ``(``, ``)``, ``=`` and ``-``
    (so hyphenated words are fused together), splits on whitespace, drops
    tokens found in the module-level ``stop_words`` and returns the remaining
    tokens joined by single spaces.
    """
    stripped = text.lower().translate(str.maketrans('', '', '()=-'))
    kept_tokens = (token for token in stripped.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [None]:
# Cleaned title+abstract text for every paper, as a plain list of strings.
c_combined_words = [clean_word_round1(doc) for doc in papers_df_el['combined']]

In [None]:
def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` with matplotlib.

    Args:
        data: A single string, or an iterable of documents (e.g. a list of
            strings) that is joined with spaces before rendering. Any other
            object is converted with ``str``.
        title: Optional figure title; when given, it is drawn as the figure's
            suptitle.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            # Join the documents themselves: str(list) would feed the list's
            # repr (quotes, commas, escape sequences) into the tokenizer.
            text = ' '.join(str(doc) for doc in data)
        except TypeError:
            # Not iterable: fall back to the plain string conversion.
            text = str(data)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    

# Visualize how often each word occurs in the papers' titles and abstracts: the larger the font size, the higher the frequency.

In [None]:
# Word cloud over the cleaned title+abstract documents.
show_wordcloud(c_combined_words, title = 'Title and abstract')

# List the top words occurring in the papers' titles and abstracts

In [None]:
# just check the words
# Bag-of-words counts over the cleaned title+abstract documents.
cv = CountVectorizer()
combined_matrix = cv.fit_transform(c_combined_words)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    combined = list(cv.get_feature_names_out())
else:
    combined = cv.get_feature_names()
# Column sums give the corpus-wide frequency of each vocabulary term.
combined_key_words_df = pd.DataFrame(combined_matrix.sum(axis =0).T,index = combined, columns =['count'] )

fig, ax = plt.subplots(figsize = (10,10))
# Take the 20 most frequent terms, then sort ascending for the horizontal bar plot.
combined_key_words_df.sort_values(by ='count', ascending = False).iloc[:20].sort_values('count', ascending=True).plot(kind='barh', ax = ax)

# For every article, count how many keywords from each sub-question's keyword list occur in its combined title and abstract

In [None]:
# key_words
# NOTE: keywords are matched against text produced by clean_word_round1, which
# lower-cases, deletes '(', ')', '=' and '-', and drops stop words. Phrases are
# therefore spelled the way they appear after that cleaning: no hyphens
# ('covid19', 'ageadjusted') and no stop words such as 'of' / 'for'.
common_words = ['ncov','covid','cov','sars','coronavirus','medical care','health care']

key_words_dic = {
    #1. Resources to support skilled nursing facilities and long term care facilities.
    1 : ['skilled nursing','long term care','ltc', 'resource','facilities','facility','health care'],

    #2. Mobilization of surge medical staff to address shortages in overwhelmed communities
    2 : ['mobilization', 'mobilize', 'surge', 'medical', 'medical staff', 'medical professional',
         'overwhelmed communities','overwhelmed','communities','shortage','lack','shortfall'],

    #3. Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies
    3 : ['ageadjusted mortality','ageadjusted','mortality data','acute respiratory', 'distress syndrome',
         'ards', 'organ failure','etiologies','etiology','senior','respiratory','mortality','viral etiologies'],

    #4. Extracorporeal membrane oxygenation (ECMO) outcomes data of COVID-19 patients
    4 : ['extracorporeal', 'membrane','oxygenation','ecmo', 'extracorporeal membrane oxygenation','covid19','patients'],

    #5. Outcomes data for COVID-19 after mechanical ventilation adjusted for age
    5 : ['mechanical','ventilation', 'age', 'adjusted age', 'mechanical ventilation adjusted age'],

    #6. Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest
    6 : ['frequency','manifestations', 'manifestation', 'extrapulmonary','course extrapulmonary', 'extrapulmonary manifestation',
         'cardiomyopathy', 'cardiac arrest', 'cardiac'],

    #7. Application of regulatory standards (e.g., EUA, CLIA) and ability to adapt care to crisis standards of care level.
    7 : ['application regulatory standard','regulatory standard', 'eua','clia', 'ability', 'care','crisis standard', 'care level'],

    #8. Approaches for encouraging and facilitating the production of elastomeric respirators, which can save thousands of N95 masks.
    8 : ['approach','encourage','facilitate','elastomeric respirator','elastomeric', 'respirator','n95', 'mask'],

    #9. Best telemedicine practices, barriers and facilitators, and specific actions to remove/expand them within and across state boundaries.
    9 : ['telemedicine','telemedicine practices','barrier', 'action', 'boundaries','boundary','remove','expand'],

    #10. Guidance on the simple things people can do at home to take care of sick people and manage disease.
    10 : ['guidance', 'home','take care', 'sick people', 'disease'],

    #11. Oral medications that might potentially work.
    11 : ['oral', 'medication', 'oral medication'],

    #12. Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually.
    12 : ['ai', 'real time', 'realtime', 'intervention', 'risk factor', 'factor', 'delivery'],

    #13. Best practices and critical challenges and innovative solutions and technologies in hospital flow and organization, workforce protection, workforce allocation, community-based support resources, payment, and supply chain management to enhance capacity, efficiency, and outcomes.
    13 : ['practice','challenge','critical','innovative','solution', 'technology', 'hospital flow', 'hospital', 'organization','workforce','protection', 'communitybased support resource'],

    #14. Efforts to define the natural history of disease to inform clinical care, public health interventions, infection prevention control, transmission, and clinical trials
    14 : ['effort','natural history', 'clinical care','clinical', 'public health intervention', 'infection', 'prevention', 'control','transmission','trial'],

    #15. Efforts to develop a core clinical outcome set to maximize usability of data across a range of trials
    15 : ['effort', 'clinical','outcome', 'maximize','usability','trial'],

    #16. Efforts to determine adjunctive and supportive interventions that can improve the clinical outcomes of infected patients (e.g. steroids, high flow oxygen)
    16 : ['adjunctive', 'supportive', 'intervention', 'clinical','improve', 'outcome', 'infected','patient', 'steroid','oxygen'],
}

In [None]:
# Store the cleaned title+abstract text next to the raw text for keyword counting.
papers_df_el['c_combined'] = papers_df_el['combined'].map(clean_word_round1)

In [None]:
def count_key_words(row, num, *, whole_word=False):
    """Count keyword occurrences for sub-question ``num`` in one document.

    Args:
        row: Text to search.
        num: Key into the module-level ``key_words_dic`` selecting the keyword
            list of one sub-question.
        whole_word: When False (default, the historical behaviour) keywords
            match as substrings, so 'ai' is also counted inside 'pain' and
            'mask' inside 'masks'. When True a keyword only matches if it is
            not directly preceded or followed by a word character.

    Returns:
        The number of non-overlapping matches, summed over all keywords of the
        sub-question. A non-string ``row`` (e.g. NaN) yields 0.

    Raises:
        KeyError: If ``num`` is not a key of ``key_words_dic``.
    """
    key_words = key_words_dic[num]
    if not isinstance(row, str):
        return 0

    total_count = 0
    for word in key_words:
        # Keywords are literal text, not regular expressions.
        pattern = re.escape(word)
        if whole_word:
            pattern = r'(?<!\w)' + pattern + r'(?!\w)'
        total_count += len(re.findall(pattern, row))
    return total_count

In [None]:
# Add one keyword-hit column (q_1 .. q_16) per sub-question.
for i in range(1, 17):
    col = f'q_{i}'
    papers_df_el[col] = papers_df_el['c_combined'].apply(count_key_words, num=i)

In [None]:
# Keywords-Article matrix: one row per paper, one hit-count column per sub-question.
count_df_el = papers_df_el[[f'q_{question}' for question in range(1, 17)]]

In [None]:

# Display the per-paper keyword counts for the 16 sub-questions.
count_df_el

In [None]:
# Label-based lookup: 40 is an index label, not a position. Rows were removed by
# dropna/drop_duplicates without resetting the index, so this raises KeyError
# if the row labelled 40 was among those dropped.
papers_df_el['title'].loc[40]

In [None]:
# Full record for index label 40 (label-based, so it raises KeyError if that
# row was removed by the earlier dropna/drop_duplicates step).
papers_df_el.loc[40]