# Scraper for collecting Belgian deaths
This notebook scrapes the reported deaths in Belgium that are publicly available on inmemoriam.be. The scraper was made to get an idea of the impact of COVID-19 on the Belgian death rate.
## 1. Background

## 2. Necessary packages

In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import datetime as dt
import matplotlib.pyplot as plt

## 3. Scraper
### 3.1 Set parameters

In [156]:
# Query window (ISO dates) and the result page used to discover how many pages exist.
current_page = 1
begin_date = "2016-03-01"
end_date = "2016-03-31"
url = "https://www.inmemoriam.be/nl/rouwberichten/?page=" + str(current_page) + "&filter=&periodStart=" + str(begin_date) + "&periodEnd=" + str(end_date) + "&yearOfBirth=&undertakerId=&placeOfResidence=&provinceId=&newsPaper=&obituary=1"

# requests has no default timeout; set one so a stalled connection cannot hang the
# notebook, and raise on HTTP errors instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Text label of every pagination link on the first result page.
page_list = soup.find_all('a', class_ = 'c-pagination__item')
pages = [link.get_text() for link in page_list]

# The last page is the highest numeric pagination label. Non-numeric links are
# ignored (presumably previous/next arrows - confirm against the live markup).
# When the result fits on a single page there are no numeric labels, so fall
# back to the current page instead of failing with an IndexError.
# end_page stays a string, as downstream code converts it with int().
numeric_labels = [label.strip() for label in pages if label.strip().isdigit()]
end_page = max(numeric_labels, key=int) if numeric_labels else str(current_page)

### 3.2 Scrape the data

In [157]:
# Output columns, in the order they appear in the final DataFrame.
RECORD_COLUMNS = ["name", "age", "date", "location"]

# (tag name, CSS class) that identifies each scraped field in the result page.
FIELD_TAGS = {
    "name": ("h3", "c-deceased__name"),
    "age": ("span", "c-deceased__age"),
    "date": ("div", "c-deceased__departed"),
    "location": ("div", "c-deceased__location"),
}


def _enclosing_card(name_tag):
    """Return the largest ancestor of ``name_tag`` that contains no other name heading.

    Used to scope the lookup of the remaining fields to a single obituary
    without relying on the class name of the surrounding container.
    """
    name_element, name_class = FIELD_TAGS["name"]
    card = name_tag
    for ancestor in name_tag.parents:
        if len(ancestor.find_all(name_element, class_=name_class)) != 1:
            break
        card = ancestor
    return card


def _page_rows(soup):
    """Return one ``{field: raw text or None}`` dict per obituary on a parsed page.

    When every field occurs equally often, the fields are paired by position.
    When the counts differ (some obituary lacks a field), positional pairing
    would attach values to the wrong person, so each field is instead looked
    up inside the element enclosing the obituary's name; absent fields are None.
    """
    found = {
        field: soup.find_all(element, class_=css_class)
        for field, (element, css_class) in FIELD_TAGS.items()
    }

    if len({len(tags) for tags in found.values()}) == 1:
        return [
            {field: tags[position].get_text() for field, tags in found.items()}
            for position in range(len(found["name"]))
        ]

    rows = []
    for name_tag in found["name"]:
        card = _enclosing_card(name_tag)
        row = {"name": name_tag.get_text()}
        for field in ("age", "date", "location"):
            element, css_class = FIELD_TAGS[field]
            node = card.find(element, class_=css_class)
            row[field] = node.get_text() if node is not None else None
        rows.append(row)
    return rows


page_list = np.arange(current_page, int(end_page) + 1)
print("Total amount of pages is: ", end_page)

# Collect plain records and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame per page is quadratic.
records = []

for num in page_list:
    print("Page ", num, " is being scraped now.")
    url = "https://www.inmemoriam.be/nl/rouwberichten/?page=" + str(num) + "&filter=&periodStart=" + str(begin_date) + "&periodEnd=" + str(end_date) + "&yearOfBirth=&undertakerId=&placeOfResidence=&provinceId=&newsPaper=&obituary=1"
    # Time out rather than hang, and stop on HTTP errors instead of parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    for row in _page_rows(soup):
        age_text = row["age"]
        date_text = row["date"]
        records.append({
            "name": row["name"],
            # The age text ends in a 5-character suffix that is dropped before
            # conversion (presumably " jaar" - confirm against the live markup).
            "age": int(age_text[0:-5]) if age_text is not None else np.nan,
            # The date text carries an 11-character prefix and one trailing
            # character around the dd/mm/YYYY value.
            "date": dt.datetime.strptime(date_text[11:-1], "%d/%m/%Y") if date_text is not None else pd.NaT,
            "location": row["location"],
        })

df = pd.DataFrame(records, columns=RECORD_COLUMNS)
# Keep age as float so missing ages can be represented as NaN, and make sure
# the date column is a proper datetime column even when no rows were scraped.
df["age"] = df["age"].astype("float64")
df["date"] = pd.to_datetime(df["date"])

print("Done.")

Total amount of pages is:  114
Page  1  is being scraped now.
Page  2  is being scraped now.
Page  3  is being scraped now.
Page  4  is being scraped now.
Page  5  is being scraped now.
Page  6  is being scraped now.
Page  7  is being scraped now.
Page  8  is being scraped now.
Page  9  is being scraped now.
Page  10  is being scraped now.
Page  11  is being scraped now.
Page  12  is being scraped now.
Page  13  is being scraped now.
Page  14  is being scraped now.
Page  15  is being scraped now.
Page  16  is being scraped now.
Page  17  is being scraped now.
Page  18  is being scraped now.
Page  19  is being scraped now.
Page  20  is being scraped now.
Page  21  is being scraped now.
Page  22  is being scraped now.
Page  23  is being scraped now.
Page  24  is being scraped now.
Page  25  is being scraped now.
Page  26  is being scraped now.
Page  27  is being scraped now.
Page  28  is being scraped now.
Page  29  is being scraped now.
Page  30  is being scraped now.
Page  31  is being

## 5. Clean dataset
### 5.1 Reshape

In [158]:
# Reverse the row order and renumber the rows 0..n-1 in a single step.
df = df.iloc[::-1].reset_index(drop=True)
df.shape

(1368, 4)

### 5.2 Reformat

In [159]:
# Derive calendar fields from the date of death with the vectorised .dt accessor
# instead of formatting row by row; missing dates become missing values.
death_dates = pd.to_datetime(df["date"])

# month_name() always yields English names, so the "March" filter below does not
# depend on the process locale the way strftime("%B") does.
df["month"] = death_dates.dt.month_name()
df["week"] = death_dates.dt.strftime("%W")  # zero-padded week of year, Monday as first day
df["day"] = death_dates.dt.strftime("%j")   # zero-padded day of year

# Store the text columns with pandas' dedicated string dtype.
df = df.astype({column: "string" for column in ["location", "month", "week", "name", "day"]})

# Keep only deaths dated in March; rows without a month are excluded.
# Copy so later edits act on an independent frame rather than a slice of the old one.
is_march = (df["month"] == "March").fillna(False)
df = df.loc[is_march].copy()

df.head()

Unnamed: 0,name,age,date,location,month,week,day
0,Maria Winters,82.0,2016-03-01,LOMMEL,March,9,61
1,Albert WINDELS,95.0,2016-03-01,Ingooigem,March,9,61
2,Yvonne Wijnants,90.0,2016-03-01,ZONHOVEN,March,9,61
3,LAOUREUX Victorine,87.0,2016-03-01,Stembert,March,9,61
4,Philippe VANDENHOVEN,59.0,2016-03-01,Bastogne,March,9,61


In [160]:
# Write the cleaned records to a CSV file in the working directory.
# The row index is written as well (pandas' default).
output_file = "march_2016.csv"
df.to_csv(output_file)