# Web scraping :: Communiqués du conseil fédéral

Source: https://www.admin.ch/gov/fr/accueil/documentation/communiques/communiques-conseil-federal.html

Dates: February 2020 - September 2020

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [5]:
# RSS feeds of the news service, one entry per queried date window.
# Each request is capped by its `end_index=199` query parameter, hence
# several windows are queried.

list_flux_rss = [
    'https://www.newsd.admin.ch/newsd/feeds/rss?lang=fr&org-nr=1070&topic=0&keyword=&offer-nr=&catalogueElement=&kind=&start_date=2019-11-01&end_date=2020-09-16&end_index=199',
    'https://www.newsd.admin.ch/newsd/feeds/rss?lang=fr&org-nr=1070&topic=0&keyword=&offer-nr=&catalogueElement=&kind=&start_date=2015-01-01&end_date=2020-05-15&end_index=199',
    'https://www.newsd.admin.ch/newsd/feeds/rss?lang=fr&org-nr=1070&topic=0&keyword=&offer-nr=&catalogueElement=&kind=&start_date=2015-01-01&end_date=2020-01-21&end_index=199',
    'https://www.newsd.admin.ch/newsd/feeds/rss?lang=fr&org-nr=1070&topic=&keyword=&offer-nr=&catalogueElement=&kind=&start_date=2015-01-01&end_date=&end_index=199',
]

# Keep the individual names available for ad-hoc use
url_1, url_2, url_3, url_4 = list_flux_rss

In [6]:
def get_info(flux_rss, keyword='corona', timeout=30):
    """Collect title, URL and publication date of matching news releases.

    Downloads one RSS feed and keeps only the items whose title contains
    ``keyword`` (case-insensitive comparison).

    Parameters
    ----------
    flux_rss : str
        URL of the RSS feed to download.
    keyword : str, optional
        Substring an item's title must contain for the item to be kept.
        Defaults to ``'corona'``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block forever on an unresponsive server.

    Returns
    -------
    tuple of (list, list, list)
        Titles, URLs and publication dates of the kept items; the three
        lists are index-aligned.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of silently
        parsing an error page into an empty result.
    """
    page = requests.get(flux_rss, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    needle = keyword.lower()
    list_titles, list_urls, list_dates = [], [], []

    for item in soup.find_all('item'):
        # Children of <item> are addressed by position:
        #   contents[1] -> title tag, contents[4] -> URL string,
        #   contents[7] -> publication-date tag.
        # NOTE(review): these positions depend on the feed's exact layout as
        # parsed by 'html.parser' -- re-check them if the feed format changes.
        title_str = item.contents[1].contents[0].strip()
        if needle in title_str.lower():
            list_titles.append(title_str)
            list_urls.append(item.contents[4].strip())
            list_dates.append(item.contents[7].contents[0])

    return list_titles, list_urls, list_dates

In [7]:
# Fetch every feed and build one frame per feed; the frames are stacked with
# a single concat afterwards (growing a DataFrame inside the loop re-copies
# all previously collected rows on every iteration).
feed_frames = []
for rss in list_flux_rss:
    titles_tmp, urls_tmp, dates_tmp = get_info(rss)
    feed_frames.append(
        pd.DataFrame({'title': titles_tmp, 'url': urls_tmp, 'pub_date': dates_tmp})
    )

results_t = (
    pd.concat(feed_frames, ignore_index=True)
    # The feeds' date windows overlap, so the same release can show up twice
    .drop_duplicates()
    # Fix datatypes: cast the scraped date objects to plain strings, then parse
    .assign(pub_date=lambda df: pd.to_datetime(df['pub_date'].astype('str')))
    .dropna()
    # Reset the index last so it is contiguous after rows were removed
    .reset_index(drop=True)
)

print(results_t.shape)
results_t.head()

(136, 3)


Unnamed: 0,title,url,pub_date
0,Coronavirus: l'allocation Corona-perte de gain...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-09-11
1,Coronavirus : pas de quarantaine pour les pers...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-09-11
2,Coronavirus : le Conseil fédéral prolonge la s...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-09-11
3,Coronavirus : les grandes manifestations à nou...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-09-02
4,Coronavirus : la Confédération et les cantons ...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-09-02


So, now we have the title, url and publication date of all news releases. We would now like to collect the text of each news release and add it to a "text" column.

In [8]:
def get_text(url, timeout=30):
    """Download a news release page and return its body text.

    The text of every ``<p>`` element is stripped, the first and the last
    five paragraphs are discarded, and the remainder is joined with single
    spaces.

    Parameters
    ----------
    url : str
        URL of the news release page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block forever on an unresponsive server.

    Returns
    -------
    str
        The concatenated paragraph text; empty if the page has six or fewer
        ``<p>`` elements.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never stored as if it were the release text.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    paragraphs = [tag.text.strip() for tag in soup.find_all('p')]
    # Remove strings at top/bottom of release: skip the first paragraph and
    # the last five.
    # NOTE(review): the 1 / -5 offsets are tied to the current page layout.
    return ' '.join(paragraphs[1:-5])

In [9]:
%%time

# Collect text of all news releases related to coronavirus
# Add them to column "text"

results_t['text'] = results_t['url'].apply(lambda x:get_text(x))
# Wall time: 49.4 s

CPU times: user 7.79 s, sys: 184 ms, total: 7.98 s
Wall time: 44.5 s


In [12]:
# Compare the frame's shape before and after removing exact duplicate rows
shape_before = results_t.shape
results_t.drop_duplicates(inplace=True)
print(shape_before, results_t.shape, sep='\n')

(136, 4)
(136, 4)


In [13]:
# Peek at five randomly chosen releases

results_t.sample(n=5)

Unnamed: 0,title,url,pub_date,text
36,Coronavirus : la Suisse rouvrira ses frontière...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-06-05,"Berne, 05.06.2020 - La conseillère fédérale Ka..."
112,Coronavirus : le Conseil fédéral décide de pre...,https://www.admin.ch/gov/fr/accueil/documentat...,2021-02-24,"Berne, 24.02.2021 - Dès lundi 1er mars 2021, l..."
81,Coronavirus : le Conseil fédéral lance un prog...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-04-16,"Berne, 16.04.2020 - Un besoin urgent de recher..."
37,Coronavirus: assouplissements temporaires dans...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-06-05,"Berne, 05.06.2020 - En raison de la pandémie d..."
70,Coronavirus : le Conseil fédéral adopte le mes...,https://www.admin.ch/gov/fr/accueil/documentat...,2020-04-22,"Berne, 22.04.2020 - Au mois de mars, le Consei..."


In [14]:
# Write the collected releases to a CSV file, leaving out the row index

results_t.to_csv(path_or_buf='communiques_conseil_fédéral.csv', index=False)