## COVID-19 : What has been published about medical care?
    -- Submitted By: Pradeep Joshi, Narendra Badam, Haripriya Iyer
    
### The top 10 matching articles for each sub-question are listed below

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import json
from IPython.display import Image
from IPython.core.display import HTML
import re
from re import finditer
import spacy
import nltk, string
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

#python -m pip install covid19_tools
import covid19_tools as cvt


from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
import os

# Load spaCy's small English pipeline. NOTE(review): `nlp` is not referenced again in the
# visible part of this notebook — confirm whether this (slow) load is still needed.
nlp = spacy.load("en_core_web_sm")  # load model package "en_core_web_sm"
#for dirname, _, filenames in os.walk('/kaggle/input'):
   # for filename in filenames:
        #print(os.path.join(dirname, filename))

import plotly.express as px
import plotly.graph_objects as go

# Any results you write to the current directory are saved as output.

In [None]:
## Loading the Data
# Read the CORD-19 metadata and expose the 'sha' column under the name 'paper_id'.
corona_df = pd.read_csv(
    '/kaggle/input/CORD-19-research-challenge/metadata.csv'
).rename(columns={'sha': 'paper_id'})
corona_df.head(3)

In [None]:
# (rows, columns) of the metadata as loaded, before any de-duplication or filtering.
# The trailing comment is the value the original author recorded for their copy of the data.
corona_df.shape #(51078, 18)

In [None]:
## Dropping duplicates and NA values from the abstract column
# Keep the first row for each distinct abstract, then discard rows with no abstract at all.
corona_df = (
    corona_df
    .drop_duplicates(subset=['abstract'])
    .dropna(subset=['abstract'])
)

corona_df.shape

In [None]:
# Work on a narrower copy without the identifier / bookkeeping columns listed below.
df_corona = corona_df.drop(columns=[
    'paper_id',
    'source_x',
    'pmcid',
    'license',
    'Microsoft Academic Paper ID',
    'WHO #Covidence',
    'has_pdf_parse',
    'has_pmc_xml_parse',
    'full_text_file',
])
df_corona.shape

In [None]:
## Sub Tasks within the Tasks
medtasks_df = pd.DataFrame({'sub_tasks': ['Resources to support skilled nursing facilities and long term care facilities',
    'Mobilization of surge medical staff to address shortages in overwhelmed communities',
    'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly \
      for viral etiologies',
    'Extracorporeal membrane oxygenation (ECMO) outcomes data of COVID-19 patients',
    'Outcomes data for COVID-19 after mechanical ventilation adjusted for age',
    'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not \
      limited to, possible cardiomyopathy and cardiac arrest',
    'Application of regulatory standards (e.g., EUA, CLIA) and ability to adapt care to crisis standards of care level',
    'Approaches for encouraging and facilitating the production of elastomeric respirators, which can save thousands of N95 masks',
    'Best telemedicine practices, barriers and faciitators, and specific actions to remove/expand them within and across state \
      boundaries',
    'Guidance on the simple things people can do at home to take care of sick people and manage disease',
    'Oral medications that might potentially work',
    'Use of Artificial Intelligence AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in \
      a way that could not be done manually',
    'Best practices and critical challenges and innovative solutions and technologies in hospital flow and organization, workforce \
    protection, workforce allocation, community-based support resources, payment, and supply chain management to enhance capacity,\
      efficiency, and outcomes',
    'Efforts to define the natural history of disease to inform clinical care, public health interventions, infection prevention \
      control, transmission, and clinical trials',
    'Efforts to develop a core clinical outcome set to maximize usability of data across a range of trials',
    'Efforts to determine adjunctive and supportive interventions that can improve the clinical outcomes of infected patients \
     (e.g. steroids, high flow oxygen)',
]})
medtasks_df

In [None]:
## Data Cleansing for Questions Asked
# Shared helpers for the cleaning cells below: the NLTK English stop-word set,
# a whitespace tokenizer and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [None]:
## Converting to Lowercase
# Lower-case each question and collapse every run of whitespace to a single space.
medtasks_df['sub_tasks_lower'] = medtasks_df['sub_tasks'].apply(
    lambda task: " ".join(task.lower().split())
)
medtasks_df

In [None]:
## Remove Punctuations
# Strip every character that is neither a word character nor whitespace.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would leave all punctuation in place. The raw string avoids the
# invalid "\w" / "\s" escape-sequence warning.
medtasks_df['sub_tasks_lower'] = medtasks_df['sub_tasks_lower'].str.replace(r'[^\w\s]', '', regex=True)
medtasks_df

In [None]:
## Stop Words in Questions and their Removal
# How many whitespace-separated tokens of the raw question appear in stop_words.
medtasks_df['stopwords'] = medtasks_df['sub_tasks'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_words)
)

# Rebuild the cleaned question from the tokens that are not stop words.
medtasks_df['sub_tasks_lower'] = medtasks_df['sub_tasks_lower'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_words)
)

# Count the stop words still present in the cleaned question after removal.
medtasks_df['stopwords_left'] = medtasks_df['sub_tasks_lower'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_words)
)
medtasks_df

In [None]:
## Tokenization (dividing the text into a sequence of words or sentences) and 
## Lemmatization (converts each word into its root form, rather than just stripping suffixes as stemming does)

def tokenize_tasks(text):
    """Split `text` on whitespace, lemmatize each token, and re-join with single spaces.

    Relies on the notebook-level `w_tokenizer` and `lemmatizer` objects.
    """
    tokens = w_tokenizer.tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)

# Replace each cleaned question with its lemmatized form.
medtasks_df['sub_tasks_lower'] = medtasks_df['sub_tasks_lower'].apply(tokenize_tasks)
medtasks_df

In [None]:
## Dropping unnecessary Columns and Renaming Columns
# The stop-word counters were diagnostics only; the cleaned text gets its final column name.
df_medtasks = (
    medtasks_df
    .drop(columns=['stopwords', 'stopwords_left'])
    .rename(columns={"sub_tasks_lower": "sub_tasks_cleansed"})
)
df_medtasks

In [None]:
## Reduce the literature set to papers mentioning Covid-19 or one of its synonyms.
## This is achieved using the method supplied in covid19-tools provided by Andy White.

## Does the paper discuss Covid-19, SARS, MERS, etc.? We look for papers that specifically refer to the
## recent outbreak, known variously as Covid-19, SARS-CoV-2, 2019-nCoV, Wuhan Pneumonia etc.
covid19_synonyms = [
    'covid',
    'coronavirus disease 19',
    'sars cov 2',
    '2019 ncov',
    '2019ncov',
    r'2019 n cov\b',
    r'2019n cov\b',
    'ncov 2019',
    r'\bn cov 2019',
    'coronavirus 2019',
    'wuhan pneumonia',
    'wuhan virus',
    'wuhan coronavirus',
    r'coronavirus 2\b',
]

## Count synonym matches and tag the papers; the next line reads the resulting
## 'tag_disease_covid19' column (presumably added by count_and_tag — confirm against covid19_tools).
df_corona, covid_vals = cvt.count_and_tag(df_corona, covid19_synonyms, 'disease_covid19')

## How many papers are tagged as discussing Covid-19 vs. not
df_corona['tag_disease_covid19'].value_counts()

In [None]:
## Filtering data where the disease_covid19 tag is True
# Keep only the tagged papers and renumber the rows from 0.
covid_df = (
    df_corona
    .loc[df_corona['tag_disease_covid19'] == True]
    .reset_index(drop=True)
)
covid_df

In [None]:
## Displaying counts of Covid and its synonyms, largest first
# covid_vals is the second value returned by cvt.count_and_tag; presumably one count per
# synonym pattern — confirm against covid19_tools.
covid_vals.sort_values(ascending=False)

In [None]:
## Plotting Covid and its synonym counts
# Horizontal line-and-marker chart: counts on the x axis, synonym labels on the y axis,
# both taken from covid_vals sorted in ascending order.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=covid_vals.sort_values(),
        y=covid_vals.sort_values().index.values,
        mode='lines+markers',
        marker={'color': "red", 'size': 12},
    )
)

fig.update_layout(
    title='Count of Covid Synonyms',
    xaxis_title='Count',
    yaxis_title='Covid and Synonyms',
)
fig.show()

In [None]:
## Breaking abstracts into sentences
# Keep the untouched abstract next to every sentence so results can show the full context.
covid_df['org abstract'] = covid_df['abstract']

# Move every other column into the index, split the abstract on ". " into one column per
# sentence, then stack those columns into one row per sentence.
#  - Index.drop takes (labels, errors); the former stray positional 1 was landing in `errors`.
#  - The raw string avoids the invalid "\." escape-sequence warning (the pattern is unchanged).
#  - dropna() removes the padding cells of shorter abstracts explicitly instead of relying on
#    stack() dropping them implicitly, which the newer stack implementation no longer does.
covid_df_by_sentence = (
    covid_df
    .set_index(covid_df.columns.drop('abstract').tolist())['abstract']
    .str.split(r'\. ', expand=True)
    .stack()
    .dropna()
    .reset_index()
    .rename(columns={0: 'abstract'})
)
covid_df_by_sentence

In [None]:
## Converting to Lowercase (this also collapses whitespace runs to single spaces)
covid_df_by_sentence['abstract_lower'] = covid_df_by_sentence['abstract'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## Remove Punctuations
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would leave all punctuation in place. Raw string avoids the
# invalid-escape warning.
covid_df_by_sentence['abstract_lower'] = covid_df_by_sentence['abstract_lower'].str.replace(r'[^\w\s]', '', regex=True)

## Stop Words in Sentences and their Removal
# Count before removal ...
covid_df_by_sentence['stopwords'] = covid_df_by_sentence['abstract_lower'].apply(lambda x: len([x for x in x.split() if x in stop_words]))

# ... remove them ...
covid_df_by_sentence['abstract_lower'] = covid_df_by_sentence['abstract_lower'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

# ... and count what is left after removal.
covid_df_by_sentence['stopwords_left'] = covid_df_by_sentence['abstract_lower'].apply(lambda x: len([x for x in x.split() if x in stop_words]))

## Tokenization and Lemmatization 
covid_df_by_sentence['abstract_lower'] = covid_df_by_sentence.abstract_lower.apply(tokenize_tasks)
covid_df_by_sentence

In [None]:
## Dropping unnecessary Columns and Renaming Columns
# Discard the stop-word diagnostics and give the cleaned sentence text its final name.
sentence_df = (
    covid_df_by_sentence
    .drop(columns=['stopwords', 'stopwords_left'])
    .rename(columns={"abstract_lower": "abstract_cleansed"})
)
sentence_df

In [None]:
## Creating a complete combined list of questions and abstracts
# Questions come first, followed by every cleaned abstract sentence, so a position below
# len(df_medtasks) always refers to a question.
lst_complete = (
    df_medtasks['sub_tasks_cleansed'].tolist()
    + sentence_df['abstract_cleansed'].tolist()
)
lst_complete

In [None]:
## Embedding through USE (Universal Sentence Encoder), loaded via TensorFlow Hub
import tensorflow as tf
import tensorflow_hub as hub

# Load version 4 of the encoder from the TF Hub URL below.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Encode all questions and abstract sentences in one call. Later cells assume that row i of
# `embeddings` corresponds to lst_complete[i].
# NOTE(review): a single call over the whole list may be memory-heavy for a large corpus —
# consider batching; confirm.
embeddings = embed(lst_complete)
#print(embeddings)

In [None]:
## Semantic similarity of two sentences can be trivially computed as the inner product of the encodings
import seaborn as sns
def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise inner products of `features`.

    labels   -- tick labels used on both axes, one per feature vector
    features -- array-like of vectors; the plotted matrix is np.inner(features, features)
    rotation -- rotation applied to the x-axis tick labels
    """
    similarity = np.inner(features, features)
    sns.set(font_scale=1.2)
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    ax = sns.heatmap(similarity, **heatmap_options)
    ax.set_xticklabels(labels, rotation=rotation)
    ax.set_title("Semantic Textual Similarity")

# Sanity-check plot over the first five entries of lst_complete (these are sub-task
# questions, since the questions are placed first in the list), x labels rotated by 90.
plot_similarity(lst_complete[0:5], embeddings[0:5], 90)

In [None]:
# Inner product between every pair of embeddings (questions and sentences alike).
# NOTE(review): this builds the full N x N matrix, but the search cells only read
# corr[leng:, index] (sentence rows, question columns). Computing just that slice would
# save a lot of memory, but the indexing in those cells would have to change with it.
corr = np.inner(embeddings, embeddings)
corr

In [None]:
# Number of questions: the first `leng` rows/columns of corr belong to them.
leng = len(medtasks_df)

# Similarity of every sentence to the first question (column 0), alongside the article
# details for that sentence. Output column name -> sentence_df column it is copied from.
sim_score_df = pd.DataFrame({
    'Similarity': corr[leng:, 0],
    **{
        out_name: sentence_df[src_name]
        for out_name, src_name in [
            ('Abstract_per_Sent', 'abstract'),
            ('Date', 'publish_time'),
            ('Title', 'title'),
            ('Authors', 'authors'),
            ('Abstract', 'org abstract'),
            ('URL', 'url'),
        ]
    },
})

In [None]:
## Finding Articles for every question
def article_search(index, corr, sentence_df, leng, top_n=10):
    """Return the abstract sentences most similar to one sub-task question.

    Parameters
    ----------
    index : int
        Position of the question; column `index` of `corr` is used as the score.
    corr : 2-D array
        Similarity matrix. Rows from `leng` onwards must line up, in order, with the
        rows of `sentence_df`.
    sentence_df : pandas.DataFrame
        One row per sentence, with the columns 'abstract', 'publish_time', 'title',
        'authors', 'org abstract' and 'url'.
    leng : int
        Number of question rows at the start of `corr` to skip.
    top_n : int, default 10
        How many of the highest-scoring sentences to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows, best match first, with the columns Abstract_per_Sent, Date,
        Title, Authors, Abstract and URL. Text values are cut to 1000 characters
        (Authors to 100). Rows are sentences, so one article may appear more than once.
    """
    max_text_chars = 1000
    max_author_chars = 100

    scored = pd.DataFrame({
        'Similarity': corr[leng:, index],
        'Abstract_per_Sent': sentence_df['abstract'],
        'Date': sentence_df['publish_time'],
        'Title': sentence_df['title'],
        'Authors': sentence_df['authors'],
        'Abstract': sentence_df['org abstract'],
        'URL': sentence_df['url'],
    })

    top = (
        scored
        .sort_values('Similarity', ascending=False)
        .head(top_n)
        .reset_index(drop=True)
        .drop(columns=['Similarity'])
    )

    # Truncate text columns only: calling .str on a numeric/datetime column raises, so a
    # non-text column (e.g. a parsed date) is passed through untouched.
    for column in top.columns:
        dtype = top[column].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            limit = max_author_chars if column == 'Authors' else max_text_chars
            top[column] = top[column].str.slice(0, limit)

    return top

## 1. Resources to support skilled nursing facilities and long term care facilities

In [None]:
# Best-matching sentences for sub-task 1 (question index 0), rendered left-aligned.
ques1_Ans = (
    article_search(0, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques1_Ans

## 2. Mobilization of surge medical staff to address shortages in overwhelmed communities

In [None]:
# Best-matching sentences for sub-task 2 (question index 1), rendered left-aligned.
ques2_Ans = (
    article_search(1, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques2_Ans

## 3. Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies

In [None]:
# Best-matching sentences for sub-task 3 (question index 2), rendered left-aligned.
ques3_Ans = (
    article_search(2, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques3_Ans

## 4. Extracorporeal membrane oxygenation (ECMO) outcomes data of COVID-19 patients

In [None]:
# Best-matching sentences for sub-task 4 (question index 3), rendered left-aligned.
ques4_Ans = (
    article_search(3, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques4_Ans

## 5. Outcomes data for COVID-19 after mechanical ventilation adjusted for age

In [None]:
# Best-matching sentences for sub-task 5 (question index 4), rendered left-aligned.
ques5_Ans = (
    article_search(4, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques5_Ans

## 6. Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest

In [None]:
# Best-matching sentences for sub-task 6 (question index 5), rendered left-aligned.
ques6_Ans = (
    article_search(5, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques6_Ans

## 7. Application of regulatory standards (e.g., EUA, CLIA) and ability to adapt care to crisis standards of care level

In [None]:
# Best-matching sentences for sub-task 7 (question index 6), rendered left-aligned.
ques7_Ans = (
    article_search(6, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques7_Ans

## 8. Approaches for encouraging and facilitating the production of elastomeric respirators, which can save thousands of N95 masks

In [None]:
# Best-matching sentences for sub-task 8 (question index 7), rendered left-aligned.
ques8_Ans = (
    article_search(7, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques8_Ans

## 9. Best telemedicine practices, barriers and facilitators, and specific actions to remove/expand them within and across state boundaries

In [None]:
# Best-matching sentences for sub-task 9 (question index 8), rendered left-aligned.
ques9_Ans = (
    article_search(8, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques9_Ans

## 10. Guidance on the simple things people can do at home to take care of sick people and manage disease

In [None]:
# Best-matching sentences for sub-task 10 (question index 9), rendered left-aligned.
ques10_Ans = (
    article_search(9, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques10_Ans

## 11. Oral medications that might potentially work

In [None]:
# Best-matching sentences for sub-task 11 (question index 10), rendered left-aligned.
ques11_Ans = (
    article_search(10, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques11_Ans

## 12. Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually

In [None]:
# Best-matching sentences for sub-task 12 (question index 11), rendered left-aligned.
ques12_Ans = (
    article_search(11, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques12_Ans

## 13. Best practices and critical challenges and innovative solutions and technologies in hospital flow and organization, workforce protection, workforce allocation, community-based support resources, payment, and supply chain management to enhance capacity, efficiency, and outcomes

In [None]:
# Best-matching sentences for sub-task 13 (question index 12), rendered left-aligned.
ques13_Ans = (
    article_search(12, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques13_Ans

## 14. Efforts to define the natural history of disease to inform clinical care, public health interventions, infection prevention control, transmission, and clinical trials

In [None]:
# Best-matching sentences for sub-task 14 (question index 13), rendered left-aligned.
ques14_Ans = (
    article_search(13, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques14_Ans

## 15. Efforts to develop a core clinical outcome set to maximize usability of data across a range of trials

In [None]:
# Best-matching sentences for sub-task 15 (question index 14), rendered left-aligned.
ques15_Ans = (
    article_search(14, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques15_Ans

## 16. Efforts to determine adjunctive and supportive interventions that can improve the clinical outcomes of infected patients (e.g. steroids, high flow oxygen)

In [None]:
# Best-matching sentences for sub-task 16 (question index 15), rendered left-aligned.
ques16_Ans = (
    article_search(15, corr, sentence_df, len(medtasks_df))
    .style
    .set_properties(**{'text-align': 'left'})
)
ques16_Ans