# Collect data about Covid-19
This notebook intend to collect data about the Covid-19 to illustrate how we can access the web programmatically. 

In [1]:
import os
import pandas as pd
from urllib.request import urlretrieve, urlopen
from datetime import datetime
import math
import re
import numpy as np
from bs4 import BeautifulSoup

data_destination_path = "D:\\workspaces\\windows\\3il\\bigdata\\data"

### Download a csv file

In [2]:
covid_csv_source_url = 'https://www.data.gouv.fr/en/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7'
covid_csv_destination_path = os.path.join(data_destination_path, 'covid.csv')

#### Download the file from the internet

In [3]:
urlretrieve(covid_csv_source_url,covid_csv_destination_path)

('D:\\workspaces\\windows\\3il\\bigdata\\data\\covid.csv',
 <http.client.HTTPMessage at 0x2329b9f8148>)

#### Display file informations

In [4]:
info = os.stat(covid_csv_destination_path)
print(info)

os.stat_result(st_mode=33206, st_ino=1688849860286953, st_dev=1948972989, st_nlink=1, st_uid=0, st_gid=0, st_size=3466241, st_atime=1613036918, st_mtime=1613036918, st_ctime=1612960757)


In [5]:
print("size (Mo): ", round(info.st_size/1000000, 3))
print("last_update: ", datetime.fromtimestamp(info.st_ctime))

size (Mo):  3.466
last_update:  2021-02-10 13:39:17.874512


#### Explore the file

In [6]:
df = pd.read_csv(covid_csv_destination_path, sep=";", nrows=1000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   dep     1000 non-null   object
 1   sexe    1000 non-null   int64 
 2   jour    1000 non-null   object
 3   hosp    1000 non-null   int64 
 4   rea     1000 non-null   int64 
 5   rad     1000 non-null   int64 
 6   dc      1000 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 54.8+ KB


#### Make some queries on data

In [7]:
df.groupby(['jour'])['rad'].sum()

jour
2020-03-18    1627
2020-03-19    2322
2020-03-20    3128
2020-03-21     820
Name: rad, dtype: int64

### Download an HTML page

In [8]:
asimov_html_source_url = 'https://en.wikipedia.org/wiki/Isaac_Asimov'
asimov_html_destination_path = os.path.join(data_destination_path, 'asimov.html')

#### Download the page

In [9]:
urlretrieve(asimov_html_source_url,asimov_html_destination_path)

('D:\\workspaces\\windows\\3il\\bigdata\\data\\asimov.html',
 <http.client.HTTPMessage at 0x2329b9ded48>)

#### Display the file informations

In [10]:
info = os.stat(asimov_html_destination_path)
print(info)

os.stat_result(st_mode=33206, st_ino=3940649673976566, st_dev=1948972989, st_nlink=1, st_uid=0, st_gid=0, st_size=723513, st_atime=1613036919, st_mtime=1613036919, st_ctime=1612965066)


In [11]:
print("size (Mo): ", round(info.st_size/1000000, 3))
print("last_update: ", datetime.fromtimestamp(info.st_mtime))

size (Mo):  0.724
last_update:  2021-02-11 10:48:39.557185


#### Explore the file

In [12]:
with open(asimov_html_destination_path, 'r', encoding="utf8") as asimov_html:
    print(asimov_html.readlines())

['<!DOCTYPE html>\n', '<html class="client-nojs" lang="en" dir="ltr">\n', '<head>\n', '<meta charset="UTF-8"/>\n', '<title>Isaac Asimov - Wikipedia</title>\n', '<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YCR1ZeEGi8gun7XyOqxR-gAAAIk","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Isaac_Asimov","wgTitle":"Isaac Asimov","wgCurRevisionId":1006084263,"wgRevisionId":1006084263,"wgArticleId":14573,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages containing links to subscription-only content","CS1 errors: missing periodical","Webarchive template wayback links","Articles with short description

#### Make some queries

Let's start with a simple search of word occurrences. 

In [13]:
science_regex = re.compile('science', re.IGNORECASE)
fiction_regex = re.compile('fiction', re.IGNORECASE)
computer_regex = re.compile('computer', re.IGNORECASE)
robot_regex = re.compile('robot', re.IGNORECASE)

with open(asimov_html_destination_path, 'r', encoding="utf8") as asimov_html:
    content = asimov_html.readlines()    
    science_count = np.array([len(science_regex.findall(line)) for line in content]).sum()
    fiction_count = np.array([len(fiction_regex.findall(line)) for line in content]).sum()
    computer_count = np.array([len(computer_regex.findall(line)) for line in content]).sum()
    robot_count = np.array([len(robot_regex.findall(line)) for line in content]).sum()
    
    print("science_count:", science_count)
    print("fiction_count:", fiction_count)
    print("computer_count:", computer_count)
    print("robot_count:", robot_count)

science_count: 645
fiction_count: 648
computer_count: 15
robot_count: 247


## Browse HTML page

In [14]:
covid_gouv_url = "https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19"
destination_extracted_covid_csv_path = os.path.join(data_destination_path, 'extracted_covid.csv')

#### Download the page

In [15]:
page = urlopen(covid_gouv_url)

#### Read the DOM

In [16]:
soup = BeautifulSoup(page, "html.parser")

#### Search for the article

In [17]:
h4 = [elt for elt in soup.findAll("h4") if elt.getText().startswith("donnees-hospitalieres-covid19-2")][0]
article = h4.find_parent('article')

#### Extract the link

In [18]:
for a in article.findAll('a'):
    if a.getText() == "Télécharger":
        link = a["href"]

In [19]:
print("link", link)

link https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7


#### Download the file from the link

In [20]:
urlretrieve(link,destination_extracted_covid_csv_path)

('D:\\workspaces\\windows\\3il\\bigdata\\data\\extracted_covid.csv',
 <http.client.HTTPMessage at 0x2329c0e3b08>)

#### Display downloaded file informations

In [21]:
info = os.stat(destination_extracted_covid_csv_path)
print("size (Mo): ", round(info.st_size/1000000, 3))
print("last_update: ", datetime.fromtimestamp(info.st_ctime))

size (Mo):  3.466
last_update:  2021-02-10 18:06:55.606231
