-
Notifications
You must be signed in to change notification settings - Fork 7
/
polls.Rmd
88 lines (72 loc) · 2.95 KB
/
polls.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
---
title: "Recipe: visualize trends in electoral polls"
author: Samuel Goëta
output: html_notebook
---
# Define
- Issue with polls: individual polls are noisy and blur the signal. We want to see the underlying trends rather than a daily update on where each poll places each candidate.
---
# Find
---
# Get
```{r}
# One-off setup (uncomment on first use):
# install.packages("rvest")
library(rvest)

# Download and parse the Wikipedia page that lists the opinion polls.
polls <- read_html(
  "https://en.wikipedia.org/wiki/Opinion_polling_for_the_French_presidential_election,_2017"
)

# Pick the first node whose class attribute contains "wikitable" and convert
# it to a data frame, using the first row as header and padding short rows.
# Earlier CSS-selector attempt, kept for reference:
#   html_node("#mw-content-text > table:nth-child(30)")
polls_round1 <- html_table(
  html_node(
    polls,
    xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "wikitable", " " ))]'
  ),
  header = TRUE,
  fill = TRUE
)
```
---
# Verify
---
# Clean
```{r}
# Clean the scraped first-round table and reshape it to long format.
# Input:  `polls_round1`, the raw table produced by the scraping chunk.
# Output: `polls_round1` (wide, numeric shares, parsed `date` column) and
#         `polls_round1_long` (one row per poll x candidate), both saved to disk.

# Remove the rows that are not useful. The indices are hard-coded for the
# table as it was scraped.
# NOTE(review): presumably extra header rows at the top and a footer row at
# position 94 -- re-check these indices if the Wikipedia table changes.
polls_round1 <- polls_round1[-c(1, 2, 3, 4, 5, 94), ]

# Replace the scraped headers with short names (party names removed).
colnames(polls_round1) <- c(
  "Poll source", "Fieldwork date", "Sample size", "Abstention", "Arthaud",
  "Poutou", "Mélenchon", "Hamon", "Macron", "Lassalle", "Fillon",
  "Dupont-Aignan", "Asselineau", "Le Pen", "Cheminade"
)

# Checkpoint the table to disk in case a later step crashes, then reload it.
save(polls_round1, file = "./polls_round1.rdata")
load(file = "./polls_round1.rdata")

library(tidyverse)

# Turn the share columns into numbers (strips the percent signs).
# across() replaces the deprecated mutate_at()/funs() combination.
# parse_number() only accepts character input, hence the as.character() guard
# in case a column was already converted to numeric while scraping.
polls_round1 <- polls_round1 %>%
  mutate(across(
    Abstention:Cheminade,
    ~ parse_number(as.character(.x), na = character())
  ))

# install.packages("lubridate")
# install.packages("rex")
library(lubridate)
library(rex)
rex_mode() # loads the rex building blocks into memory (see the rex help)

# Build the regex from a readable description instead of a raw pattern string.
# NOTE(review): in rex, `numbers` and `letters` look like the "one or more"
# shortcuts, so the quantifiers below are probably looser than "1-2 digits" /
# "exactly 3 letters" -- confirm against the rex shortcuts documentation.
get_date <- rex(
  # Optional start of a date range: a day number followed by an en dash.
  maybe(
    numbers %>% between(1, 2),
    "–"
  ),
  # The part we want to keep, exposed as the named group "date".
  capture(
    name = "date",
    numbers %>% between(1, 2), # day
    space,
    letters %>% n_times(3), # month abbreviation
    space,
    "20", one_of(1:5), number # year
  )
)

# Extract the captured date text from each fieldwork string and parse it as
# day-month-year. Done inside mutate() so no global `date` vector is created
# (which would shadow lubridate::date).
polls_round1 <- polls_round1 %>%
  mutate(date = dmy(re_matches(`Fieldwork date`, get_date)$date))

# Reshape to long format: one row per poll and candidate.
# pivot_longer() replaces the superseded gather(); rows come out grouped by
# poll rather than by candidate, which does not affect the plot.
polls_round1_long <- polls_round1 %>%
  pivot_longer(
    cols = Abstention:Cheminade,
    names_to = "Candidates",
    values_to = "Percentage"
  )

save(polls_round1_long, file = "./polls_round1_long.Rdata")
```
---
# Visualize
```{r}
# Plot every poll result as a point and overlay a smoothed trend line per
# candidate. Expects `polls_round1_long` with the columns `date`,
# `Candidates` and `Percentage`.

# One-off setup (uncomment on first use); hrbrthemes provides theme_ipsum().
# devtools::install_github("hrbrmstr/hrbrthemes")
library(hrbrthemes)

polls_round1_long %>%
  # Abstention is not a candidate, so leave it out of the chart.
  filter(Candidates != "Abstention") %>%
  # Shared aesthetics live here so both layers use the same mapping.
  ggplot(aes(x = date, y = Percentage, color = Candidates)) +
  # alpha is a fixed setting, not a mapped variable, so it goes outside aes().
  geom_point(alpha = 0.7) +
  geom_smooth() +
  labs(
    x = "Date of poll",
    y = "Percentage of candidate in poll",
    title = "Polls during the 2017 Presidential election in France",
    subtitle = "Evolution of polls during the month preceding the first round of the 2017 Presidential election in France",
    caption = "Source : Wikipedia"
  ) +
  theme_ipsum() +
  # Reminder: theme() tweaks must be added after theme_ipsum().
  theme(legend.position = "right")
```
---
# Present