# Covid-19 Chatbot

## Part I: Fine-Tuning Data

### 1. Load Packages

In [1]:
# data processing libraries
import numpy as np
import pandas as pd

In [2]:
# web scraping and HTML parsing libraries
import requests
import re
from bs4 import BeautifulSoup

In [3]:
# other libraries
import time

### 2. URL & Link

In [4]:
# base address of the WHO website; scraped relative links are appended to it
URL = 'https://www.who.int'

# request headers: a desktop-browser User-Agent string passed to requests.get
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.198 Safari/537.36"
    )
}

# path of the covid-19 question-and-answers hub, relative to URL
COVID_QA_SUBLINK = '/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub'

### 3. Web Scraping & HTML Parsing

#### 3.1 Extract covid-19 Q&A topic links

In [5]:
# function for extracting the topic hyperlinks of the covid-19 Q&A hub

def topic_links():
    """
    Return the topics listed on the covid-19 Q&A hub page.

    The hub page (URL + COVID_QA_SUBLINK) is downloaded and scanned with two
    regular expressions: one for the topic titles and one for the relative
    links of the topic pages.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns 'topic' (title text) and 'link'
        (URL prepended to the relative link found on the page).

    Raises
    ------
    requests.HTTPError
        If the hub page answers with an error status code.
    ValueError
        If the number of titles and the number of links found differ.
    """
    covid_link = URL + COVID_QA_SUBLINK

    # a timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(covid_link, headers=headers, timeout=30)
    # fail loudly instead of parsing an error page into an empty table
    response.raise_for_status()
    html_content = response.text

    # non-greedy / negated-class groups: several elements on one line are
    # matched one by one instead of being swallowed into a single capture
    topic_pattern = r'<span class="trimmed">(.*?)</span>'
    link_pattern = r'<a class="sf-list-vertical__item" href="([^"]*)"'
    topic_list = re.findall(topic_pattern, html_content)
    sublinks = re.findall(link_pattern, html_content)

    # titles and links are paired by position, so their counts must agree
    if len(topic_list) != len(sublinks):
        raise ValueError(
            f'found {len(topic_list)} topic titles but {len(sublinks)} topic links; '
            'the page markup may have changed'
        )

    link_list = [URL + sublink for sublink in sublinks]

    df_topic = pd.DataFrame({'topic': topic_list, 'link': link_list})
    return df_topic

In [6]:
# dataframe of topic links: scrape the hub page once, then preview the first rows
df_topic = topic_links()
df_topic.head()

Unnamed: 0,topic,link
0,Coronavirus disease (COVID-19),https://www.who.int/emergencies/diseases/novel...
1,Coronavirus disease (COVID-19): Adolescents an...,https://www.who.int/emergencies/diseases/novel...
2,Coronavirus disease (COVID-19): Breastfeeding,https://www.who.int/emergencies/diseases/novel...
3,Coronavirus disease (COVID-19): Casirivimab an...,https://www.who.int/emergencies/diseases/novel...
4,Coronavirus disease (COVID-19): Children and m...,https://www.who.int/emergencies/diseases/novel...


In [7]:
# (rows, columns) of the topic dataframe
df_topic.shape

(56, 2)

In [8]:
# column names, non-null counts and dtypes of the topic dataframe
df_topic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   topic   56 non-null     object
 1   link    56 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


#### 3.2 Extract Q&A from topic links

In [9]:
# function for extracting all Q&A pairs of a topic

def qa_extractor(topic_link):
    """
    Return the question/answer pairs found on one topic page.

    Parameters
    ----------
    topic_link : str
        Address of the topic page to download.

    Returns
    -------
    pandas.DataFrame
        One row per accordion entry with the columns 'question' and 'answer'.

    Raises
    ------
    requests.HTTPError
        If the topic page answers with an error status code.
    ValueError
        If the number of questions and the number of answers found differ.
    """
    # a timeout keeps one stalled page from blocking the whole scrape
    response = requests.get(topic_link, headers=headers, timeout=30)
    # fail loudly instead of turning an error page into an empty dataframe
    response.raise_for_status()
    topic_soup = BeautifulSoup(response.text, 'html.parser')

    questions = topic_soup.find_all('div', class_='sf-accordion__trigger-panel')
    question_list = [question.text.strip() for question in questions]

    answers = topic_soup.find_all('div', class_='sf-accordion__content')
    answer_list = []
    for answer in answers:
        # string=True selects the text nodes; it is the current spelling of
        # the legacy text=True argument
        # NOTE(review): the selected nodes appear to include HTML comments,
        # which would explain the '[if gte mso 9]' residue some answers
        # carry - confirm before filtering them out here
        fragments = answer.find_all(string=True)
        # skip nodes that are only a newline or a non-breaking space, then
        # turn the remaining newlines / non-breaking spaces into plain spaces
        fragments = [
            fragment.replace('\n', ' ').replace('\xa0', ' ')
            for fragment in fragments
            if fragment not in ('\n', '\xa0')
        ]
        answer_list.append(' '.join(fragments))

    # questions and answers are paired by position, so their counts must agree
    if len(question_list) != len(answer_list):
        raise ValueError(
            f'{topic_link}: found {len(question_list)} questions '
            f'but {len(answer_list)} answers'
        )

    df_qa = pd.DataFrame({'question': question_list, 'answer': answer_list})
    return df_qa

In [10]:
# creating the Q&A dataframe: extract every topic page, then stack the
# per-topic frames under one fresh 0..n-1 index
link_list = df_topic['link'].tolist()
qa_frames = list(map(qa_extractor, link_list))
df = pd.concat(qa_frames, ignore_index=True)

In [11]:
# preview the first rows of the Q&A dataframe
df.head()

Unnamed: 0,question,answer
0,What is COVID-19?,COVID-19 is the disease caused by a new corona...
1,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are Fever...
2,What happens to people who get COVID-19?,"Among those who develop symptoms, most (about ..."
3,Who is most at risk of severe illness from COV...,"People aged 60 years and over, and those with ..."
4,Are there long-term effects of COVID-19?,"Some people who have had COVID-19, whether the..."


In [12]:
# (rows, columns) of the Q&A dataframe
df.shape

(516, 2)

In [13]:
# column names, non-null counts and dtypes of the Q&A dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  516 non-null    object
 1   answer    516 non-null    object
dtypes: object(2)
memory usage: 8.2+ KB


### 4. Data Cleaning

In [14]:
# unclean text: some answers remain partially unparsed and still contain '><'
has_markup_residue = df['answer'].str.contains('><', regex=False)
df_unclean = df[has_markup_residue]
df_unclean

Unnamed: 0,question,answer
330,What is physical activity?,[if gte mso 9]><xml> <w:WordDocument> <w:Vi...
331,Why do we need it?,[if gte mso 9]><xml> <w:WordDocument> <w:Vi...


In [15]:
# clean text: keep only what follows the last '[endif]' marker of each
# unclean answer (.loc is used for reading and writing alike, so the row is
# always addressed by its label and never by its position)
for index in df_unclean.index:
    df.loc[index, 'answer'] = df.loc[index, 'answer'].split('[endif]')[-1].strip()

# check cleaning: count the answers that still contain '><'
n_unclean = int(df['answer'].apply(lambda x: '><' in x).sum())
print(f'number of unclean texts: {n_unclean}')

number of unclean texts: 0


### 5. Data Resize

In [16]:
# token calculation
def _rough_token_count(row):
    """Estimate the tokens of one Q&A row: space-separated pieces of 'question answer' divided by 0.75."""
    pieces = (row['question'] + ' ' + row['answer']).split(' ')
    return int(len(pieces) / 0.75)

df['token'] = df.apply(_rough_token_count, axis=1)
df.head()

Unnamed: 0,question,answer,token
0,What is COVID-19?,COVID-19 is the disease caused by a new corona...,57
1,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are Fever...,228
2,What happens to people who get COVID-19?,"Among those who develop symptoms, most (about ...",116
3,Who is most at risk of severe illness from COV...,"People aged 60 years and over, and those with ...",80
4,Are there long-term effects of COVID-19?,"Some people who have had COVID-19, whether the...",133


In [17]:
# large tokens: rows whose estimated token count exceeds 1000
df.loc[df['token'].gt(1000)]

Unnamed: 0,question,answer,token
26,COVID-19 makes me really anxious. I feel worri...,In situations like a pandemic it is very norma...,1117
207,Are people living with HIV at increased risk o...,People living with HIV (PLHIV) who are not tak...,1010
217,Someone in my household tested positive for CO...,It’s hard when someone close to you is unwell....,1128


In [18]:
# resize case by case: each oversized answer is cut at the phrase where its
# dispensable trailing part begins (row label -> phrase)
truncation_markers = {
    # website reference removed
    26: 'Find out more on our',
    # additional information about WHO platform removed
    207: 'For this purpose, WHO has',
    # additional information about virus spread removed
    217: 'The virus can also spread',
}

for row_label, marker in truncation_markers.items():
    # .loc on both sides: the row is read and written by the same label
    # (an answer without its marker is left unchanged apart from stripping)
    df.loc[row_label, 'answer'] = df.loc[row_label, 'answer'].split(marker)[0].strip()

In [19]:
# token recalculation: same estimate as before (space-separated pieces of
# 'question answer' divided by 0.75), recomputed after the answers were cut
qa_text = df['question'] + ' ' + df['answer']
df['token'] = qa_text.map(lambda text: int(len(text.split(' ')) / 0.75))
max_token = df['token'].max()
print(f'maximum token: {max_token}')

maximum token: 996


### 6. Cost Estimate

In [20]:
# cost calculation: 4 x total estimated tokens x (0.006 / 1000), rounded to 2 decimals
# NOTE(review): 4 is presumably the number of training passes and 0.006 a
# price in $ per 1000 tokens - confirm against the provider's price list
total_tokens = df['token'].sum()
price_per_token = 0.006 / 1000
tuning_cost = round(4 * total_tokens * price_per_token, 2)
print(f'cost: {tuning_cost}$')

cost: 2.63$


### 7. Save Data

In [21]:
# create dataframe: drop the helper 'token' column and switch to the
# prompt / completion column names
df_tuning = (
    df.drop(columns='token')
      .rename(columns={'question': 'prompt', 'answer': 'completion'})
)

# add an extra space before completion
df_tuning['completion'] = ' ' + df_tuning['completion']

In [22]:
# preview the first rows of the prompt / completion dataframe
df_tuning.head()

Unnamed: 0,prompt,completion
0,What is COVID-19?,COVID-19 is the disease caused by a new coron...
1,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are Feve...
2,What happens to people who get COVID-19?,"Among those who develop symptoms, most (about..."
3,Who is most at risk of severe illness from COV...,"People aged 60 years and over, and those with..."
4,Are there long-term effects of COVID-19?,"Some people who have had COVID-19, whether th..."


In [23]:
# column names, non-null counts and dtypes of the prompt / completion dataframe
df_tuning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   prompt      516 non-null    object
 1   completion  516 non-null    object
dtypes: object(2)
memory usage: 8.2+ KB


In [24]:
# save the prompt / completion pairs as a csv file, without the index column
output_file = 'covid19_qa.csv'
df_tuning.to_csv(output_file, index=False)