# Récupération des notes données aux arrondissements de Paris sur ville-ideale.fr 

'''We want to scrape the marks that residents of Paris have given to their arrondissement on the website ville-ideale.fr. The marks are out of 10 and the global mark is: (mean of 8 criteria + quality-of-life criterion) / 2.'''

In [222]:
'''Importation of webscraping and datascience packages'''
#!pip install -q lxml
import numpy as np
import bs4 #BeautifulSoup4 for scraping
import lxml
import pandas #datascience library
import urllib #scraping
import time
import requests #scraping
import random
import yaml #to use yml files
from urllib import request #to get the html source code

In [225]:
# Load the request headers used to make the scraper look like a regular
# browser. "headers.yml" holds several header sets (Chrome, Firefox, ...);
# they are stored in browser_headers.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("headers.yml", encoding="utf-8") as f_headers:
    browser_headers = yaml.safe_load(f_headers)

In [228]:
# Build the two per-arrondissement lists used to locate each page on
# ville-ideale.fr.
# Ordinal labels: "1er" for the first arrondissement, then "2e" to "20e".
arrondissements_list = ["1er"]
arrondissements_list.extend("{}e".format(number) for number in range(2, 21))
# Postal codes "75101" to "75120": the prefix "751" followed by the
# arrondissement number zero-padded to two digits.
postal_codes_list = ["751" + str(number).zfill(2) for number in range(1, 21)]


In [229]:
# Pair every arrondissement label with its postal code as [label, code].
arrondissements_table = [
    [label, code]
    for label, code in zip(arrondissements_list, postal_codes_list)
]
# One page URL per arrondissement on ville-ideale.fr, in the same order.
urls = [
    "https://www.ville-ideale.fr/paris-" + label + "-arrondissement_" + code
    for label, code in arrondissements_table
]
# Empty DataFrame that will receive one row of marks per arrondissement.
df_marks_arrondissements = pandas.DataFrame()


In [231]:
# Scrape the marks of every arrondissement by looping over the page URLs.
# This takes about one hour because of the long pause between requests,
# which is needed in order not to be blocked by the website.

# Fixed part of the pause between two requests, in seconds. The pause only
# matters between HTTP requests, so it is taken once per page instead of
# being spread between local parsing steps.
REQUEST_PAUSE_SECONDS = 170
# Give up on a request that has not answered after this many seconds instead
# of hanging forever.
REQUEST_TIMEOUT_SECONDS = 30

# The available header sets do not change during the run: build the list once.
header_choices = list(browser_headers.values())

for position, (arrondissement, url_arr) in enumerate(zip(arrondissements_list, urls)):
    # Wait before every request except the first one; a random 1-5 second
    # jitter is added so the requests are not evenly spaced.
    if position:
        time.sleep(REQUEST_PAUSE_SECONDS + random.randint(1, 5))

    # Use a randomly chosen set of browser headers for this request.
    headers = random.choice(header_choices)

    # Get the HTML source of the arrondissement page. A non-2xx answer
    # (for example when the scraper gets blocked) raises requests.HTTPError
    # here instead of failing later while parsing an error page.
    response = requests.get(url_arr, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # Parse the page with BeautifulSoup4 to make the HTML data retrieval easier.
    page_arr = bs4.BeautifulSoup(response.text, "lxml")

    # Element holding the global mark, and the table holding one row per
    # criterion (criterion name in <th>, mark in <td>).
    global_mark = page_arr.find('p', {'id': 'ng'})
    table_marks = page_arr.find('table', {'id': 'tablonotes'})
    if global_mark is None or table_marks is None:
        raise RuntimeError(
            "Marks not found on " + url_arr
            + ": the page layout changed or the request was blocked"
        )

    # Two parallel lists: the criteria names and the corresponding marks.
    criteria_list = []
    marks_list = []
    for row in table_marks.find_all('tr'):
        criteria_list.append(row.find('th').text.strip())
        marks_list.append(row.find('td').text.strip())

    # One-row DataFrame: one column per criterion, row named after the
    # arrondissement.
    df_marks = pandas.DataFrame(
        [marks_list],
        columns=criteria_list,
        index=["Paris " + arrondissement],
    )
    # The global mark is the first four characters of the element's text.
    df_marks["Note Globale"] = global_mark.text.strip()[:4]

    # Append right away (rather than once after the loop) so that the rows
    # already scraped are kept if a later request fails.
    df_marks_arrondissements = pandas.concat([df_marks_arrondissements, df_marks])

## Nous obtenons ce tableau final des notes selon l'arrondissement et le critère :

In [232]:
'''We obtain this final table of marks for all arrondissements'''
# Bare expression: presumably rendered as the cell output when run in a
# notebook; it has no effect when run as a plain script.
df_marks_arrondissements

Unnamed: 0,Environnement,Transports,Sécurité,Santé,Sports et loisirs,Culture,Enseignement,Commerces,Qualité de vie,Note Globale
Paris 1er,450,933,525,650,600,858,617,825,633,658
Paris 2e,313,817,638,725,463,800,567,846,600,623
Paris 3e,552,819,793,793,678,933,781,881,752,765
Paris 4e,427,762,669,646,512,746,712,677,573,609
Paris 5e,730,852,818,855,700,850,895,841,845,832
Paris 6e,625,860,790,810,590,840,855,820,755,765
Paris 7e,688,760,796,676,560,724,752,680,756,730
Paris 8e,678,900,783,750,550,744,767,794,806,776
Paris 9e,514,850,624,762,617,850,724,855,726,725
Paris 10e,324,763,374,660,575,618,553,651,454,510


In [239]:
# Save the table of marks as the Excel workbook "Notes_arrondissements.xlsx"
# in the current working directory.
name = "Notes_arrondissements.xlsx"
df_marks_arrondissements.to_excel(excel_writer=name)