## Imports

In [None]:
import numpy as np
import pandas as pd
import os
import json
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Directory holding one parsed-PDF JSON file per paper.
path = "/kaggle/input/CORD-19-research-challenge/document_parses/pdf_json"
# os.listdir() returns entries in arbitrary order; sort them so that the
# positional slice taken from dir_list later selects the same files on every run.
dir_list = sorted(os.listdir(path))

def clean_text(rgx_list, text):
    """Return ``text`` lower-cased with every regex match deleted.

    The patterns in ``rgx_list`` are applied one after another, in order,
    each one operating on the output of the previous substitution.
    """
    cleaned = text.lower()
    for pattern in rgx_list:
        # Matches are replaced by the empty string, i.e. removed outright.
        cleaned = re.sub(pattern, repl='', string=cleaned)
    return cleaned

## Get jsons

In [None]:
# Regexes handed to clean_text(); they are applied in list order and every
# match is deleted:
#   1. punctuation, brackets and quote characters ('.' and ',' are not in the class)
#   2. a single word in parentheses, 3. a single word in square brackets
#   4. tabs, newlines and other control whitespace, digits, non-breaking spaces
# NOTE(review): patterns 2 and 3 can never match, because pattern 1 has already
# removed every '(', ')', '[' and ']' by the time they run. Moving them first
# would drop short parentheticals such as "(sars)" entirely — confirm which
# behaviour is wanted before reordering.
# NOTE(review): pattern 4 deletes whitespace characters instead of replacing
# them with a space, so words separated only by a newline/tab get fused.
patterns= [r'[\/#!$\^&\*;:{}=\-\[\]_`~()“”…—\"\"‘’\'\'–]',
           r'\(\w+\)', r'\[\w+\]',
           r'[\t\n\r\f\v\d\xa0]']

all_texts = []      # one nltk.Text per document
doc_strings = []    # cleaned body text of each document, joined once below
for file in dir_list[5000:7000]:
    # The parses are JSON, so decode as UTF-8 rather than the platform default.
    with open(os.path.join(path, file), encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Clean every body paragraph and merge them into one string per document.
    paragraphs = [clean_text(patterns, para['text'].strip())
                  for para in data['body_text']]
    texts2 = ' '.join(paragraphs)
    doc_strings.append(texts2)

    all_texts.append(nltk.Text(word_tokenize(texts2)))

# Join documents with a space so the last word of one document is not glued to
# the first word of the next (which would corrupt the boundary sentence), and
# build the string once instead of growing it with += inside the loop.
str_texts = ' '.join(doc_strings)

if all_texts:
    print(all_texts[0])
else:
    print("No documents were loaded from", path)

In [None]:
"""co= {'cov': all_texts}

import json
with open('covid.json', 'w') as fp:
    json.dump(co, fp)
    
from IPython.display import FileLink
FileLink(r'./covid.json')"""

## Collocations

In [None]:
"""
Collocations: 
Influenza vaccine, vaccine efficacy, sars vaccine, vaccine uptake, 
varicella vaccines, malaria vaccines
"""

# Print the collocations found in each document, one blank line between documents.
for document in all_texts:
    document.collocations()
    print()

# Vaccines

In [None]:
"""
Found: 
-DNA vaccines are stable.
-Vaccine preparation is time-consuming.
"""
# Keyword-in-context view of "vaccine", limited to the first 500 documents.
first_documents = all_texts[:500]
for document in first_documents:
    document.concordance('vaccine')
    print()

In [None]:
"""
Found: 
-DNA-based vaccines have advantages such as 
stability, 
gene manipulation, and 
large-scale manufacturing.
"""

# Token-level pattern with two alternatives:
#   "dna" ... "vaccine" ...   (up to 40 tokens in each gap / trailing context)
#   ... "vaccine" ... "dna"   (up to 40 tokens of leading context / gap)
dna_vaccine_pattern = (
    r"<dna> <[\w\.]+>{,40} <vaccine> <[\w\.]+>{,40}"
    r"|<[\w\.]+>{,40} <vaccine> <[\w\.]+>{,40} <dna>"
)

for document in all_texts:
    document.findall(dna_vaccine_pattern)

In [None]:
"""
Found: 
-An epidemiological study by Melamed et al. found that low 
levels of vitamin d increase the risk of mortality.
-Vitamin d affects the risk of mortality from Covid-19.
-Some authors reported that the mean level of oh vitamin d 
level was inversely correlated with covid infection rates and mortality.
-Vitamin d level was an independent predictor of covid related mortality.
-Also found evidence that links vitamin d deficiency to covid mortality.
-A study done in iran found that improving vitamin d status in the 
general population and particularly hospitalized patients have a 
potential benefit in reducing the severity of morbidities and mortality.
-Emerging studies on covid indicate the association of vitamin d
deficiency with covid severity and mortality.
-Mortality caused by covid and average serum vitamin d levels in
european countries and showed a strong association between low vitamin
levels and cases of infection by coronavirus.
"""

# Token-level pattern with two alternatives:
#   "mortality" ... "vitamin d" ...   (up to 20 tokens in each gap / trailing context)
#   ... "vitamin d" ... "mortality"   (up to 20 tokens of leading context / gap)
vitamin_d_mortality_pattern = (
    r"<mortality> <[\w\.]+>{,20} <vitamin> <d> <[\w\.]+>{,20}"
    r"|<[\w\.]+>{,20} <vitamin> <d> <[\w\.]+>{,20} <mortality>"
)

for document in all_texts:
    document.findall(vitamin_d_mortality_pattern)

# COVID-19 risk in children

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# NOTE(review): latin-1 never raises a decode error, but if metadata.csv is
# actually UTF-8, non-ASCII characters in titles are mis-decoded — confirm the
# file's real encoding.
df = pd.read_csv(r"/kaggle/input/CORD-19-research-challenge/metadata.csv", encoding ="latin-1")

stopwords = set(STOPWORDS)

# Missing titles are NaN; str(NaN) would inject the literal word "nan" into the
# cloud, so drop them before building the text.
titles = df['title'].dropna().astype(str)

# Lower-case each title, normalise its whitespace to single spaces, and build
# the whole text in one join instead of repeated string +=.
comment_words = ''.join(' '.join(title.lower().split()) + ' ' for title in titles)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [None]:
from nltk import word_tokenize, Text

# Build one searchable token stream from all paper titles. Rows without a title
# are NaN; dropping them keeps the spurious token "nan" out of the corpus.
s = " ".join(df['title'].dropna().astype(str))
texts = Text(word_tokenize(s))

In [None]:
# Show keyword-in-context lines for "children" in the title corpus.
texts.concordance("children")

In [None]:
# Show keyword-in-context lines for "adults" in the title corpus.
texts.concordance("adults")

In [None]:
# Find title fragments where "children"/"Children" is followed, after at most
# 6 tokens, by "death" or "risk"; up to 2 preceding tokens are included.
# Only the lower-case forms of "death" and "risk" are matched.
texts.findall(r"<.*>{,2} <children|Children> <.*>{,6} <death|risk>")

# Asymptomatic

In [None]:
"""
Found: 
-Children appear to be more likely than adults to present with
asymptomatic or mild disease.
-Children have mild severe acute respiratory syndrome coronavirus
sarscov confirmed disease compared to adults, and up to onethird are
asymptomatic. recent work suggests that children are also less likely
to become infected with sarscov, which is strikingly different to the
higher prevalence and severity observed in children.
-Children, do get infected but are nearly asymptomatic. 
-Children and adolescents tend to be asymptomatic or mildly ill, it is
presumed that they are more at risk of spreading the disease to
elderly. this is based on past flu pandemic experience.
-Children were a protected group, but this may have been because they
were less likely to have frequented the wuhan wet market, or because
they were more likely to have asymptomatic or mild disease and thus
less likely to have been tested.
-Children remain otherwise asymptomatic and, interestingly, often test
negative for sarscov in nasopharyngeal samples.
-Asymptomatic infections were reported to be common among 
confirmed cases in children.
-An italian survey reported a time increase in the rate of kawasakilike
presentation during the covid pandemic among children. 
The nasopharyngeal swabs taken from these children were negative, once
again putting into discussion a direct responsibility of sarscov infection.
-Children had negative covid testing by nasopharyngeal reverse 
transcriptase polymerase chain reaction rt pcr.
"""

# Token-level pattern with two alternatives (tokens may contain word
# characters, '.' and ','):
#   ... "children" ... "nasopharyngeal" ...   (up to 20 / 30 / 30 tokens)
#   ... "nasopharyngeal" ... "children" ...   (up to 40 / 40 / 20 tokens)
children_nasopharyngeal_pattern = (
    r"<[\w\.,]+>{,20} <children> <[\w\.,]+>{,30} <nasopharyngeal> <[\w\.,]+>{,30}"
    r"|<[\w\.,]+>{,40} <nasopharyngeal> <[\w\.,]+>{,40} <children> <[\w\.,]+>{,20}"
)

for document in all_texts:
    document.findall(children_nasopharyngeal_pattern)

# Risk factors

In [None]:
# Split the accumulated, cleaned body text of all loaded documents into sentences.
sentences = nltk.sent_tokenize(str_texts)

In [None]:
"""
Found:
- Preexisting endothelial dysfunction explains how patients with old age, obesity, hypertension and
  diabetes mellitus are at a higher risk for a fatal outcome when suffering from covid. 
- Comorbilidades como hipertensión, enfermedad coronaria y diabetes se han asociado con un riesgo
  significativamente mayor de muerte entre los pacientes con la covid.
- retrospective series report a higher mortality rate of covid patients when on active cancer
  treatment receiving chemotherapy.
- Some studies using IRT have shown that patients with diabetes and neuropathy have higher plantar
  temperature than individuals without this pathology.
- It was also observed that male participants showed a significantly better psychological and
  emotional health than females.
"""

# Terms are compared against the lower-cased sentence, so they must be lower
# case themselves (the former 'SARS'/'SARSCOV2' entries could never match).
covid = ['covid', 'covid19', 'corona', 'coronavirus', 'corona-virus', 'sars', 'sarscov2',
         'severe acute respiratory syndrome']
risk_factors = ["hypertension", "diabetes", "male", "males", "heart disease", "copd", "smoking",
                "cerebrovascular", "cardiovascular", "cancer", "respiratory disease", "chronic kidney disease",
                "chronic respiratory disease", "beer", "drink", "obese", "overweight", "chronic liver disease"]

# Risk factors must start at a word boundary so that "male" no longer fires on
# "female"; suffixes are still allowed ("drink" matches "drinking").
risk_factor_re = re.compile(
    r"\b(?:" + "|".join(re.escape(factor) for factor in risk_factors) + ")"
)

# Print every sentence that mentions at least one risk factor and at least one
# COVID term, prefixed with its index in `sentences`. COVID terms stay plain
# substring tests so compounds such as "noncovid" are still found.
for i, s in enumerate(sentences):
    string = s.lower()
    if risk_factor_re.search(string) and any(term in string for term in covid):
        print("["+str(i)+"]-", s, "\n")