#Imports and spreadsheet setups

In [None]:
#Allows notebook to alter spreadsheet with all faq data
#Runs the Colab authentication step, then builds the gspread client `gc`
#from the application-default credentials; `gc` is what later opens the spreadsheet.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
#Load spreadsheet and import common functions
#NOTE(review): pd and set_with_dataframe are not used by any cell visible in this part of the notebook.
import pandas as pd
from gspread_dataframe import set_with_dataframe #used to write dataframe into sheet
#`spreadsheet` is the shared handle read by checkExists, setupNewSheet and the export cells
spreadsheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/1FQJ9uCiJIwUHh7qYiRtTDisZHh-nuECZg_4strNqzLM/edit#gid=1405501365")

In [None]:
def checkExists(title):
  """Return True if the spreadsheet already has a worksheet named `title`.

  Reads the notebook-level `spreadsheet` object.

  Args:
    title: worksheet (tab) name to look for; compared with `==` against each tab's title.

  Returns:
    True when at least one worksheet has exactly this title, otherwise False.
  """
  #List the tabs once and reuse the result instead of asking for every tab
  #again by index; any() also stops at the first match.
  return any(worksheet.title == title for worksheet in spreadsheet.worksheets())

In [None]:
def setupNewSheet(title):
  """Add a worksheet named `title` to the spreadsheet and write its header row.

  The worksheet is created with 1000 rows and 26 columns, and row 1 gets
  'Question', 'Answer' and 'Context' in columns 1 to 3.

  Returns:
    The worksheet object returned by spreadsheet.add_worksheet.
  """
  #Open question from the original author: can sheets be duplicated or columns widened instead?
  sheet = spreadsheet.add_worksheet(title, 1000, 26)
  headers = ('Question', 'Answer', 'Context')
  for column, header in enumerate(headers, start=1):
    sheet.update_cell(1, column, header)
  return sheet

In [None]:
def exportToSheet(dictionary, sheet): #dict in form of {question:[answers,contexts]}
  """Write every (question, answer, context) triple of `dictionary` into `sheet`.

  Rows are filled from row 2 downwards, one row per (answer, context) pair:
  question in column 1, answer in column 2, context in column 3. Row 1 is not
  touched. Prints 'done' when finished.

  All values are sent in a single batched update instead of three requests
  plus a one-second pause per row, so no rate-limit sleep or progress bar is
  needed.

  Args:
    dictionary: mapping of question -> iterable of (answer, context) pairs.
    sheet: worksheet object offering range(first_row, first_col, last_row,
      last_col) and update_cells(cells, value_input_option=...), such as a
      gspread worksheet.
  """
  firstRow, firstColumn = 2, 1 #row 1 is left alone (setupNewSheet puts the headers there)
  rows = [(key, value, context)
          for key in dictionary
          for value, context in dictionary[key]]
  if rows: #nothing to request or send for an empty dictionary
    lastRow = firstRow + len(rows) - 1
    cells = sheet.range(firstRow, firstColumn, lastRow, firstColumn + 2)
    for cell in cells:
      #place values by each cell's own coordinates rather than relying on list order
      cell.value = rows[cell.row - firstRow][cell.col - firstColumn]
    #USER_ENTERED is intended to match the input mode of the per-cell update_cell writes
    #this replaces (per the gspread documentation); update_cells defaults to RAW otherwise
    sheet.update_cells(cells, value_input_option='USER_ENTERED')
  print('done')

In [None]:
#imports
!pip install -q requests
!pip install -q beautifulsoup4

import requests
from bs4 import BeautifulSoup
from pprint import pprint
import re
import time
from tqdm.notebook import tqdm

#Scraping FAQs

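Each section outputs a sheet with three columns: the question, one paragraph of its answer per row, and the context for that paragraph. In the scrapers below, the context is a neighbouring paragraph of the same answer: the following paragraph for the first paragraph, and the preceding paragraph otherwise (empty when the answer has only one paragraph).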

##NYT
  link: https://www.nytimes.com/interactive/2020/world/coronavirus-tips-advice.html

In [None]:
#Download the NYT coronavirus FAQ page and parse it into `soup`
URL = "https://www.nytimes.com/interactive/2020/world/coronavirus-tips-advice.html"
page = requests.get(URL, timeout=30) #without a timeout a stalled connection would hang the cell indefinitely
page.raise_for_status() #stop on an HTTP error instead of silently parsing an error page
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
#Build QA = {question: [(answer paragraph, context paragraph), ...]} from the parsed NYT page
questionInfo = soup.find_all('div', {'class': 'g-question-wrap'})[:20] #only the first 20 question blocks are used
QA = {}
for item in questionInfo:
  questionText = item.find('h3', {'itemprop': 'name'}).getText()
  answerParagraphs = item.find_all('p')[:-1] #the last paragraph is dropped (a link to more resources)
  answerText = []
  for index, paragraph in enumerate(answerParagraphs):
    answer = paragraph.getText()

    #context is the neighbouring paragraph: the next one for the first paragraph,
    #the previous one otherwise; a lone paragraph has no context
    if len(answerParagraphs) > 1:
      neighbour = answerParagraphs[1] if index == 0 else answerParagraphs[index-1]
      context = neighbour.getText()
    else:
      context = ''

    answerText.append((answer, context))
  QA[questionText] = answerText

In [None]:
QA

In [None]:
#load data into spreadsheet
#The tab is created and filled only when no worksheet titled 'NYT' exists yet,
#so re-running this cell leaves an existing 'NYT' tab untouched.
title = 'NYT'
if not checkExists(title):
  NYT = setupNewSheet(title)
  exportToSheet(QA, NYT)

##

##CDC
  link: https://www.cdc.gov/coronavirus/2019-ncov/faq.html#Symptoms-&-Testing
  
  NOTE: the scraped CDC data was afterwards reviewed and corrected by hand, because the scraper produced many problems such as broken answers.
  

In [None]:
#Download the CDC coronavirus FAQ page and parse it into `soup`
URL = "https://www.cdc.gov/coronavirus/2019-ncov/faq.html#Symptoms-&-Testing"
page = requests.get(URL, timeout=30) #without a timeout a stalled connection would hang the cell indefinitely
page.raise_for_status() #stop on an HTTP error instead of silently parsing an error page
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
#Scrape the page into dictionary of questions and a paragraph of the answer

#Sentences containing these terms only pointed to another website, so they are stripped from the answers.
#Compiled once here instead of once per paragraph.
#NOTE(review): "reccomendation" is misspelled inside the pattern, so that alternative presumably
#never matches the page text; left unchanged until confirmed against the site.
removeLinks = re.compile(r"(?<=\.).*(?=learn more|additional information|more information|website|see CDC's reccomendation)(.*)(\.)")
#Text that introduces a bulleted list
bulletIntro = re.compile(r':|any of these')

questionInfo = soup.find_all('div', {'class':'card card-accordion '})
QA = {} #questions as keys, lists of (answer paragraph, context) tuples as values
for qa in questionInfo:
  #find question text of each qa pair
  question = qa.find('div', {'class':'card-header collapsed'}).getText()
  #find answer text of each qa pair
  answerParagraphs = qa.find_all(['p','li'])

  #Group every element with the bullet points it introduces. Walking the list by position
  #(instead of list.index/list.remove, which compare tags by content) keeps the bounds check
  #exact, so bullets at the very end of an answer are merged rather than raising IndexError
  #and being emitted a second time as separate answers.
  groups = [] #(leading element, merged text)
  position = 0
  while position < len(answerParagraphs):
    lead = answerParagraphs[position]
    text = lead.getText()
    position += 1
    if bulletIntro.search(text):
      while position < len(answerParagraphs) and answerParagraphs[position].name == 'li':
        text += answerParagraphs[position].getText()
        position += 1
    groups.append((lead, text))

  answers = []
  for index, (lead, text) in enumerate(groups):
    #context is the raw text of the neighbouring group's leading element:
    #the next one for the first group, the previous one otherwise
    context = ''
    if len(groups) > 1:
      neighbour = groups[1][0] if index == 0 else groups[index-1][0]
      context = neighbour.getText()

    #TODO answers that consist only of links / "for more information" sentences still need handling
    text = re.sub("\xa0", " ", text) #links in the answers would be ended by '\xa0' and this is to revert that
    text = removeLinks.sub("", text)
    answers.append((text,context))
  QA[question] = answers

6 6
2 2
3 3
3 3


In [None]:
#Fix context
#Rebuild QA_dict = {question: [[answer, context], ...]} from the rows of the 'CDC' worksheet
CDC = spreadsheet.worksheet('CDC')
QA = CDC.get_all_values()
QA_dict = {}
for question, answer, context in QA[1:]: #the first row is skipped
  #swap non-breaking spaces for ordinary spaces in both text columns
  answer = answer.replace("\xa0", " ")
  context = context.replace("\xa0", " ")
  #rows sharing a question are collected under one key, in sheet order
  QA_dict.setdefault(question, []).append([answer, context])

In [None]:
#fix context: every answer gets the next answer of the same question as its context;
#the last answer wraps around to the first one, so a question with a single answer
#ends up with that answer as its own context
for question in QA_dict:
  entries = QA_dict[question]
  for ind, seq in enumerate(entries):
    #store the new context in the entry itself: assigning it only to a loop variable
    #would leave QA_dict unchanged
    seq[1] = entries[(ind + 1) % len(entries)][0]

QA_dict

{'Am I at risk for COVID-19 from mail, packages, or products?': [['There is still a lot that is unknown about COVID-19 and how it spreads. Coronaviruses are thought to be spread most often by respiratory droplets. Although the virus can survive for a short period on some surfaces, it is unlikely to be spread from domestic or international mail, products or packaging. However, it may be possible that people can get COVID-19 by touching a surface or object that has the virus on it and then touching their own mouth, nose, or possibly their eyes, but this is not thought to be the main way the virus spreads.',
   'There is still a lot that is unknown about COVID-19 and how it spreads. Coronaviruses are thought to be spread most often by respiratory droplets. Although the virus can survive for a short period on some surfaces, it is unlikely to be spread from domestic or international mail, products or packaging. However, it may be possible that people can get COVID-19 by touching a surface o

In [None]:
#Write QA_dict into the 'CDC COPY' tab. The tab is fetched with spreadsheet.worksheet,
#not created with setupNewSheet, so this cell does not write a header row.
title = 'CDC COPY'
CDC_copy = spreadsheet.worksheet(title)
exportToSheet(QA_dict, CDC_copy)

HBox(children=(FloatProgress(value=0.0, max=106.0), HTML(value='')))


done


[['Question', 'Answer', 'Context'],
 ['What is a novel coronavirus?',
  'A novel coronavirus is a new coronavirus that has not been previously identified. The virus causing coronavirus disease 2019 (COVID-19), is not the same as the coronaviruses that commonly circulate among humans and cause mild illness, like the common cold.',
  'A novel coronavirus is a new coronavirus that has not been previously identified. The virus causing coronavirus disease 2019 (COVID-19), is not the same as the coronaviruses that commonly circulate among humans and cause mild illness, like the common cold.'],
 ['Why is the disease being called coronavirus disease 2019, COVID-19?',
  'On February 11, 2020 the World Health Organization announced an official name for the disease that is causing the 2019 novel coronavirus outbreak, first identified in Wuhan China. The new name of this disease is coronavirus disease 2019, abbreviated as COVID-19. In COVID-19, ‘CO’ stands for ‘corona,’ ‘VI’ for ‘virus,’ and ‘D’ f

In [None]:
#clean up QA to remove all non answers

In [None]:
#Pull the question text and the <p>/<li> elements of one CDC entry (index 16) out of `soup`,
#presumably to inspect a problematic answer by hand.
#NOTE(review): relies on `soup` still holding the CDC page when this cell runs.
questionInfo = soup.find_all('div', {'class':'card card-accordion '})
question = questionInfo[16].find('div', {'class':'card-header collapsed'}).getText()
answerParagraphs = questionInfo[16].find_all(['p','li'])

In [None]:
QA

In [None]:
#Export QA into the 'CDC' tab.
#NOTE(review): the "Fix context" cell rebinds QA to the raw rows returned by get_all_values();
#exportToSheet needs the {question: [(answer, context), ...]} dictionary, so confirm the cell run order.
CDC = spreadsheet.worksheet('CDC')
exportToSheet(QA, CDC)

done


In [None]:
print(answerParagraphs)



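##UN

link: https://www.un.org/en/coronavirus/covid-19-faqs

NOTE: the code in this section currently requests the Johns Hopkins Medicine page (https://www.hopkinsmedicine.org/health/conditions-and-diseases/coronavirus), not the UN link above.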

In [None]:
#Download the Johns Hopkins Medicine coronavirus page and parse it into `soup`
URL = "https://www.hopkinsmedicine.org/health/conditions-and-diseases/coronavirus"
page = requests.get(URL, timeout=30) #without a timeout a stalled connection would hang the cell indefinitely
page.raise_for_status() #stop on an HTTP error instead of silently parsing an error page
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
#Seed QA with the hand-written "What is COVID-19?" entry; the answer text is also used as its own context
what_is_covid_ans = "COVID-19 is the disease caused by the new coronavirus that emerged in China in December 2019. COVID-19 symptoms include cough, fever or chills, shortness of breath or difficulty breathing, muscle or body aches, sore throat, new loss of taste or smell, diarrhea, headache, fatigue, nausea or vomiting and congestion or runny nose. COVID-19 can be severe, and some cases have caused death. The new coronavirus can be spread from person to person. It is diagnosed with a laboratory test. There is no coronavirus vaccine yet. Prevention involves frequent hand-washing, coughing into the bend of your elbow, staying home when you are sick and wearing a cloth face covering if you can't practice physical distancing."
QA = {"What is COVID-19?": [[what_is_covid_ans, what_is_covid_ans]]}

In [None]:
#Scrape the page into dictionary of questions and a paragraph of the answer
questionBlocks = soup.find_all('div', {'class': 'rtf'})
#keep only the 'rtf' divs at positions 1 and 3 of the result
questionBlocks = [questionBlocks[i] for i in (1,3)]

#gather questions and their answer groups
#NOTE(review): the inner loop has no body in the lines visible here (only commented-out
#code follows), so this cell looks unfinished and would not run as shown.
for block in questionBlocks:
  for element in block.next_elements:
      
  
  # questionText = item.find('h3', {'itemprop': 'name'}).getText()
  # answerParagraphs = item.find_all('p')[:-1] #last paragraph was a link to more resources
  # answerText = []
  # for index in range(len(answerParagraphs)):
  #   answer = answerParagraphs[index].getText()

  #   #retrieve context for answer
  #   context = ''
  #   if len(answerParagraphs) > 1:
  #     if index == 0:
  #       context = answerParagraphs[index+1].getText()
  #     else:
  #       context = answerParagraphs[index-1].getText()
    
  #   answerText.append((answer,context))
  # QA[questionText] = answerText