In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime

In [2]:
# Download the Wikipedia article whose tables are scraped in the cells below.
url = "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Lebanon"
# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
r = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
r.raise_for_status()
html_content = r.text

In [3]:
# Parse the downloaded page and collect every table carrying the "wikitable" class.
html_soup = BeautifulSoup(html_content, "html.parser")
# find_all / class_ are the current Beautiful Soup spellings; findAll is the
# legacy BS3 alias that the library documents as deprecated.
tables = html_soup.find_all('table', class_="wikitable")
# The <th> cells of the first wikitable are the header labels used below.
headers = tables[0].find_all('th')
headers

[<th style="white-space:nowrap;">.<br/>Date<br/>.<br/>.
 </th>,
 <th>Cases
 </th>,
 <th>Deaths
 </th>,
 <th>Recoveries
 </th>]

In [4]:
# Read the label out of every <th> cell; strip('\n') trims the newlines
# wrapped around each header text.
header_labels = [cell.text.strip('\n') for cell in headers]

# Skip the first header (the date cell) and keep the remaining labels.
columns = header_labels[1:]
columns

['Cases', 'Deaths', 'Recoveries']

In [5]:
# Get the records from the second wikitable. The table is rotated: each <tr>
# holds one whole series (dates, cases, deaths, recoveries) rather than one day.
body = tables[1].find_all('tr')

# One <td> per date. strip() removes surrounding whitespace only, so a cell
# that does not end with a newline no longer loses its last real character
# (the previous text[0:-1] slice dropped it unconditionally).
Date = [cell.text.strip() for cell in body[0].find_all('td')]

# The numeric rows are whitespace-separated, so split() yields one string per date.
Cases = body[1].text.split()
Deaths = body[2].text.split()
Recoveries = body[3].text.split()

# the last row is only the origin of the dataset

# Every series must have one entry per date; report the lengths explicitly
# when they disagree instead of staying silent.
series_lengths = {
    'Date': len(Date),
    'Cases': len(Cases),
    'Deaths': len(Deaths),
    'Recoveries': len(Recoveries),
}
if len(set(series_lengths.values())) == 1:
    print("There are no missing values")
else:
    raise ValueError(
        "Scraped rows have different lengths: {}".format(series_lengths)
    )

There are no missing values


### Build the dataframe

In [6]:
# Gather the scraped series under their column names, then build the frame once.
records = {
    'Dates': Date,
    'Cases': Cases,
    'Deaths': Deaths,
    'Recoveries': Recoveries,
}
df = pd.DataFrame(records)

# Index the rows by their date label; 'Dates' also stays as a regular column.
df.index = df['Dates']
df

Unnamed: 0_level_0,Dates,Cases,Deaths,Recoveries
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21 Feb 2020,21 Feb 2020,1,0,0
26 Feb 2020,26 Feb 2020,2,0,0
28 Feb 2020,28 Feb 2020,4,0,0
29 Feb 2020,29 Feb 2020,7,0,0
01 Mar 2020,01 Mar 2020,10,0,0
...,...,...,...,...
31 May 2020,31 May 2020,1220,27,712
01 June 2020,01 June 2020,1233,27,715
02 June 2020,02 June 2020,1242,27,719
03 June 2020,03 June 2020,1256,27,724


### Convert the index to datetime index

In [7]:
# Parse the date labels currently used as the index into a DatetimeIndex,
# then install it as the frame's index.
date_index = pd.DatetimeIndex(df.index)
df.index = date_index
df.index

DatetimeIndex(['2020-02-21', '2020-02-26', '2020-02-28', '2020-02-29',
               '2020-03-01', '2020-03-02', '2020-03-04', '2020-03-06',
               '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10',
               '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14',
               '2020-03-15', '2020-03-16', '2020-03-17', '2020-03-18',
               '2020-03-19', '2020-03-20', '2020-03-21', '2020-03-22',
               '2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26',
               '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30',
               '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03',
               '2020-04-04', '2020-04-05', '2020-04-06', '2020-04-07',
               '2020-04-08', '2020-04-09', '2020-04-10', '2020-04-11',
               '2020-04-12', '2020-04-13', '2020-04-14', '2020-04-15',
               '2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19',
               '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23',
      

### Save the dataframe to a csv file

In [8]:
# Write the frame to disk. index=False leaves the index out of the file;
# the dates are still written through the 'Dates' column.
output_path = 'Corona-Lebanon.csv'
df.to_csv(output_path, index=False, encoding='utf-8')