# **Incubation Period Across Age Groups: AI Lit. Review with Table (AI-LRT)**

![](https://sportslogohistory.com/wp-content/uploads/2018/09/georgia_tech_yellow_jackets_1991-pres-1.png)



In [None]:
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nlp = spacy.load('en_core_web_lg')
import numpy as np
import pandas as pd
!pip install bert-extractive-summarizer
from summarizer import Summarizer
model = Summarizer()
import os
import json
from pprint import pprint
from copy import deepcopy
import math
import torch
from transformers import *
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
def search_focus(df):
    """Return the rows of ``df`` whose abstract mentions a COVID-19 keyword.

    The ``abstract`` column is searched for 'covid', '-cov-2', 'cov2' and
    'ncov' (each passed to ``str.contains``). The per-keyword matches are
    stacked in that keyword order, then rows repeating an earlier title are
    dropped so only the first occurrence of each title remains.
    """
    keywords = ('covid', '-cov-2', 'cov2', 'ncov')
    matches = [df[df['abstract'].str.contains(term)] for term in keywords]
    combined = pd.concat(matches)
    return combined.drop_duplicates(subset='title', keep="first")

# load the metadata from the CSV file, keeping only the columns listed in usecols
df=pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv', usecols=['title','journal','abstract','authors','doi','publish_time','sha','full_text_file'])
print (df.shape)
# replace missing values with a placeholder string
df=df.fillna('no data provided')
# drop duplicate titles
df = df.drop_duplicates(subset='title', keep="first")
# keep only papers whose publish_time mentions 2020
df=df[df['publish_time'].str.contains('2020')]
# lowercase the abstract and append the lowercased title so both get searched;
# the space keeps the last abstract word and the first title word from fusing,
# which would also hide the final sentence boundary from the later '. ' split
df["abstract"] = df["abstract"].str.lower()+' '+df["title"].str.lower()
# keep only the papers that mention a COVID-19 keyword
df=search_focus(df)
print (df.shape)
# show the first rows of the filtered dataframe
df.head()

In [None]:
from IPython.core.display import display, HTML
import functools

def remove_stopwords(text,stopwords):
    """Strip basic punctuation and stopwords from ``text``.

    Args:
        text: The string to clean.
        stopwords: Object whose ``words()`` method returns the words to drop
            (the caller in this file passes the imported NLTK ``stopwords``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove the punctuation characters in a single pass before tokenizing.
    text = text.translate(str.maketrans('', '', '!.,?()-'))
    # Fetch the stopwords once: the original called stopwords.words() again
    # for every token, which dominated the run time of this function.
    # NOTE(review): words() is called without a language argument, as before;
    # confirm whether only English stopwords were intended.
    stopword_set = set(stopwords.words())
    return ' '.join(word for word in word_tokenize(text) if word not in stopword_set)
def score_sentence(search,sentence):
    """Return the spaCy similarity between ``sentence`` and ``search``.

    Both strings are run through the module-level ``nlp`` pipeline and the
    sentence document is compared against the search document.
    """
    sentence_doc = nlp(sentence)
    query_doc = nlp(search)
    return sentence_doc.similarity(query_doc)

def score_sentence_prob(search,sentence):
    """Score how well ``sentence`` covers the keywords in ``search``.

    The score is (total keyword occurrences / sentence word count) scaled by
    the fraction of keywords that occur at least once. 1 is added when every
    keyword is present.

    Args:
        search: Keywords separated by whitespace.
        sentence: Sentence to score; matching is exact, per whitespace token.

    Returns:
        The score as a float, or 0.0 when either string has no tokens.
    """
    keywords = search.split()
    sent_parts = sentence.split()
    # Nothing to match, or nothing to match against: avoid dividing by zero.
    if not keywords or not sent_parts:
        return 0.0
    counts = [sent_parts.count(word) for word in keywords]
    word_match = sum(counts)
    missing = counts.count(0)
    percent = 1 - (missing / len(keywords))
    final_score = (word_match / len(sent_parts)) * percent
    if missing == 0:
        final_score = final_score + 1
    return final_score

# BERT pretrained question answering module
def answer_question(question,text, model,tokenizer):
    """Extract the answer span for ``question`` from ``text`` with a QA model.

    Args:
        question: Question string.
        text: Passage to search for the answer.
        model: Callable taking a tensor of input ids plus a ``token_type_ids``
            tensor and returning ``(start_scores, end_scores)``.
        tokenizer: Object providing ``encode`` and ``convert_ids_to_tokens``.

    Returns:
        The tokens from the highest-scoring start position through the
        highest-scoring end position, joined by spaces. The string is empty
        when the best end position falls before the best start position.
    """
    input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
    # NOTE(review): depending on the transformers version, encode() may add
    # its own [CLS]/[SEP] around this already-marked text - confirm.
    input_ids = tokenizer.encode(input_text)
    # Segment 0 runs through the first token with id 102 (presumably [SEP] in
    # this vocabulary); everything after it is segment 1. Look it up once.
    sep_position = input_ids.index(102)
    token_type_ids = [0 if i <= sep_position else 1 for i in range(len(input_ids))]
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    start = int(torch.argmax(start_scores))
    end = int(torch.argmax(end_scores))
    return ' '.join(all_tokens[start:end + 1])

def score_sentence_study(sentence):
    """Return the spaCy similarity between ``sentence`` and fixed study-size keywords.

    The sentence and the keyword string are both parsed with the module-level
    ``nlp`` pipeline before being compared.
    """
    study_terms = 'included total cases patients sample size collected gathered enrolled study'
    sentence_doc = nlp(sentence)
    terms_doc = nlp(study_terms)
    return sentence_doc.similarity(terms_doc)

def _clean_answer(answer):
    """Remove the spacing that tokenization leaves inside an extracted answer."""
    answer = answer.replace(" ##", "")
    # Re-join decimals written with a middle dot ("5 \u00b7 2" -> "5\u00b72"). The
    # previous literal was the mojibake form of U+00B7 and so never matched.
    answer = answer.replace(" \u00b7 ", "\u00b7")
    return answer.replace(" . ", ".")


def process_question(df,search,focus):
    """Build the literature-review table for one question.

    Papers are pre-filtered on the focus terms. Each remaining abstract is
    split into sentences on '. ', and sentences longer than 75 characters
    whose similarity to ``search`` exceeds 0.70 are collected as the excerpt.
    Papers with an excerpt are passed to the QA model to extract sample size,
    study design, and incubation period and range.

    Relies on the module-level ``model``, ``tokenizer`` and ``stopwords``.

    Args:
        df: DataFrame with 'abstract', 'title', 'doi', 'journal' and
            'publish_time' columns.
        search: Query used to score sentences (stopwords are removed first).
        focus: Terms that must ALL appear in an abstract. Either a
            whitespace-separated string or an iterable of ``str.contains``
            patterns. An empty focus disables the filter.

    Returns:
        DataFrame with one row per retained paper, sorted by 'rel_score'
        in descending order.
    """
    # NOTE(review): "study" appears twice (title, then study design); kept
    # as-is so the displayed table and CSV schema stay unchanged.
    df_table = pd.DataFrame(columns = ["date","study","link","journal","days","range","sample","study","excerpt","rel_score"])
    # A plain string must be split into words first: iterating it directly
    # yields single characters, which made the filter match almost everything.
    focus_terms = focus.split() if isinstance(focus, str) else list(focus)
    if focus_terms:
        mask = functools.reduce(lambda a, b: a & b, (df['abstract'].str.contains(s) for s in focus_terms))
        df1 = df[mask]
    else:
        df1 = df
    search = remove_stopwords(search, stopwords)
    for _, row in df1.iterrows():
        pub_sentence = ''
        hi_score = 0
        for sentence in row['abstract'].split('. '):
            if len(sentence) <= 75:
                continue
            rel_score = score_sentence(search, sentence)
            if not (rel_score > .70):
                continue
            sentence = sentence.capitalize()
            if not sentence.endswith('.'):
                sentence = sentence + '.'
            pub_sentence = pub_sentence + ' ' + sentence
            if rel_score > hi_score:
                hi_score = rel_score

        if pub_sentence == '':
            continue

        # Only the first 1000 characters of the abstract are given to the model.
        abstract_head = row['abstract'][0:1000]
        sample = answer_question('how many patients or cases were in the study, review or analysis?', abstract_head, model, tokenizer)
        sample = sample.replace("#", "").replace(" , ", ",")
        # A bare '19' is presumably the tail of 'covid-19', not a sample size.
        if sample == '19' or sample == '' or '[SEP]' in sample or len(sample) > 50:
            sample = 'unk'
        sample = sample.replace(" ", "")

        design = answer_question('what type or kind of review or study was conducted?', abstract_head, model, tokenizer)
        design = design.replace(" ##", "")
        if '[SEP]' in design or '[CLS]' in design or len(design) > 75:
            design = 'unk'

        # The incubation questions are asked against the excerpt only.
        shorter = pub_sentence[0:1000]
        incubation_period = _clean_answer(answer_question('what is the mean incubation period age group', shorter, model, tokenizer))
        incubation_range = _clean_answer(answer_question('what is the incubation range age group', shorter, model, tokenizer))

        df_table.loc[len(df_table)] = [
            row['publish_time'],
            row["title"],
            'https://doi.org/' + row['doi'],
            row["journal"],
            incubation_period,
            incubation_range,
            sample,
            design,
            pub_sentence,
            hi_score,
        ]
    return df_table.sort_values(by=['rel_score'], ascending=False)

def prepare_summary_answer(text,model):
    """Call ``model`` on ``text`` and return whatever the model produces."""
    summary = model(text)
    return summary

###### MAIN PROGRAM ######
# question used to score the sentences of each abstract
search='incubation period age group'

# focus keywords passed to process_question to pre-filter the abstracts
focus='incubation period age group'

# score the papers and build the results table
df_table=process_question(df,search,focus)

display(HTML('<h2>'+search+'</h2>'))
display(HTML('<h5>*** Note: this table keeps the document excerpt and score for ease of review. A clean literature review formatted CSV file is available in the data section without those fields.</h5>'))

# convert df_table to html and display it (escape=False leaves cell text unescaped)
df_table_show=HTML(df_table.to_html(escape=False,index=False))
display(df_table_show)

# the saved CSV omits the review-only excerpt and score columns
df_table=df_table.drop(columns=['excerpt', 'rel_score'])
df_table.to_csv('incubation_period_age.csv', index = False)

print ('done')