forked from InseeFrLab/R-Insee-Data
/
get_insee_idbank.R
178 lines (154 loc) · 6.32 KB
/
get_insee_idbank.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#' Get data from INSEE series idbank
#'
#' @details Get data from INSEE series idbanks.
#' The user can disable the download display in the console with the following command :
#' Sys.setenv(INSEE_download_verbose = "FALSE")
#'
#' The function reads its settings from environment variables :
#' INSEE_sdmx_link_idbank (base url of the queries),
#' INSEE_get_idbank_limit (maximum number of idbanks accepted when limit is TRUE),
#' INSEE_sdmx_idbank_limit (maximum number of idbanks sent in a single query) and
#' INSEE_download_verbose. When the number of idbanks is higher than
#' INSEE_sdmx_idbank_limit, the idbanks are split into several queries and the
#' results are stacked into one table.
#'
#' @param ... one or several series key (idbank), as character strings or as one character vector
#' @param startPeriod start date of data
#' @param endPeriod end date of data
#' @param firstNObservations get the first N observations for each key series (idbank)
#' @param lastNObservations get the last N observations for each key series (idbank)
#' @param includeHistory boolean to access the previous releases (not available on all series)
#' @param updatedAfter starting point for querying the previous releases (format yyyy-mm-ddThh:mm:ss)
#' @param limit by default, the function get_insee_idbank has a 1200-idbank limit.
#' Set limit argument to FALSE to ignore the limit or modify the limit with the
#' following command : Sys.setenv(INSEE_get_idbank_limit = 1200)
#' @return a tibble with the data, or NULL (with a message) if the idbanks are
#' missing, are not characters, or exceed the limit
#' @examples
#' \donttest{
#'
#' #example 1 : import price index of industrial products and turnover index : manufacture of wood
#' data = get_insee_idbank("001558315", "010540726")
#'
#' #example 2 : unemployment data
#'
#' library(tidyverse)
#'
#' df_idbank_list_selected =
#'   get_idbank_list("CHOMAGE-TRIM-NATIONAL") %>% #unemployment dataset
#'   filter(SEXE == 0) %>% #men and women
#'   add_insee_title()
#'
#' idbank_list_selected = df_idbank_list_selected %>% pull(idbank)
#'
#' unem = get_insee_idbank(idbank_list_selected)
#'
#' #example 3 : French GDP growth rate
#'
#' library(tidyverse)
#'
#' df_idbank_list_selected =
#'   get_idbank_list("CNT-2014-PIB-EQB-RF") %>% # Gross domestic product balance
#'   filter(FREQ == "T") %>% #quarter
#'   filter(OPERATION == "PIB") %>% #GDP
#'   filter(NATURE == "TAUX") %>% #rate
#'   filter(CORRECTION == "CVS-CJO") #SA-WDA, seasonally adjusted, working day adjusted
#'
#' idbank = df_idbank_list_selected %>% pull(idbank)
#'
#' data = get_insee_idbank(idbank) %>%
#'   add_insee_metadata()
#'
#' #plot
#' ggplot(data, aes(x = DATE, y = OBS_VALUE)) +
#'   geom_col() +
#'   ggtitle("French GDP growth rate, quarter-on-quarter, sa-wda") +
#'   labs(subtitle = sprintf("Last updated : %s", data$TIME_PERIOD[1]))
#' }
#'
#' @export
get_insee_idbank <- function(...,
                             limit = TRUE,
                             startPeriod = NULL,
                             endPeriod = NULL,
                             firstNObservations = NULL,
                             lastNObservations = NULL,
                             includeHistory = NULL,
                             updatedAfter = NULL) {

  # Settings come from environment variables; an unset or non-numeric limit
  # becomes NA here and is reported explicitly further down.
  insee_bdm_series_link <- Sys.getenv("INSEE_sdmx_link_idbank")
  insee_get_idbank_limit <-
    suppressWarnings(as.numeric(Sys.getenv("INSEE_get_idbank_limit")))
  insee_sdmx_idbank_limit <-
    suppressWarnings(as.numeric(Sys.getenv("INSEE_sdmx_idbank_limit")))
  insee_download_verbose <- Sys.getenv("INSEE_download_verbose") == "TRUE"

  # ---- collect the idbanks passed through `...` ----
  dots <- list(...)

  if (length(dots) == 0) {
    msg <- "The idbank list is missing"
    message(crayon::style(msg, "red"))
    return(NULL)
  }

  # A single argument is taken as is (typically a character vector);
  # several arguments are flattened into one vector.
  list_idbank <- if (length(dots) == 1) dots[[1]] else unlist(dots)

  if (length(list_idbank) > 0 && !is.character(list_idbank)) {
    msg <- "!!! idbanks should be characters !!!"
    message(crayon::style(msg, "red"))
    return(NULL)
  }

  list_idbank <- unique(list_idbank)
  n_idbank <- length(list_idbank)

  if (n_idbank == 0) {
    msg <- "Error : idbank missing"
    message(crayon::style(msg, "red"))
    return(NULL)
  }

  # ---- validate the limits read from the environment ----
  # Without these checks an unset variable surfaces later as an obscure
  # "missing value where TRUE/FALSE needed" or "NA/NaN argument" error.
  if (is.na(insee_sdmx_idbank_limit) || insee_sdmx_idbank_limit < 1) {
    stop("Environment variable INSEE_sdmx_idbank_limit must be set to a ",
         "number greater than or equal to 1.", call. = FALSE)
  }
  # The overall limit only matters when the limit is enforced.
  if (limit && is.na(insee_get_idbank_limit)) {
    stop("Environment variable INSEE_get_idbank_limit must be set to a ",
         "number, or the limit argument must be set to FALSE.", call. = FALSE)
  }

  if (limit && n_idbank > insee_get_idbank_limit) {
    msg1 <- sprintf("By default, this function has a %s-idbank limit.", insee_get_idbank_limit)
    msg2 <- "Please set limit argument to FALSE to ignore the limit."
    # The variable named here is the one actually read at the top of the function.
    msg3 <- "Otherwise, modify the limit with the following command : Sys.setenv(INSEE_get_idbank_limit = 1200)."
    msg4 <- "Beware that it could be slow."
    msg5 <- "Nevertheless, the data is cached, so all queries are only run once per R session."
    msg6 <- "A query run twice is then almost immediate."
    msg <- sprintf("%s\n %s\n %s\n %s\n %s\n %s", msg1, msg2, msg3, msg4, msg5, msg6)
    message(crayon::style(msg, "red"))
    return(NULL)
  }

  several_queries <- n_idbank > insee_sdmx_idbank_limit

  if (insee_download_verbose && limit && several_queries) {
    msg1 <- sprintf("The number of idbanks is higher than %s (insee's sdmx query limit),", insee_sdmx_idbank_limit)
    msg2 <- "multiple queries are then triggered."
    msg3 <- "To make it faster, please reduce the number of idbanks."
    msg4 <- "The data is cached, so all queries are only run once per R session."
    msg5 <- "A query run twice is then almost immediate."
    msg <- sprintf("%s\n %s\n %s\n %s\n %s\n", msg1, msg2, msg3, msg4, msg5)
    message(crayon::style(msg, "red"))
  }

  # ---- build the query string ----
  # Booleans are sent in lower case ("true"/"false"); FALSE used to be sent
  # as "FALSE". NOTE(review): assumes the endpoint expects lower-case
  # booleans, as for the TRUE case already handled before - confirm.
  if (!is.null(includeHistory)) {
    if (isTRUE(includeHistory == TRUE)) {
      includeHistory <- "true"
    } else if (isTRUE(includeHistory == FALSE)) {
      includeHistory <- "false"
    }
  }

  query_args <- list(
    startPeriod = startPeriod,
    endPeriod = endPeriod,
    firstNObservations = firstNObservations,
    lastNObservations = lastNObservations,
    includeHistory = includeHistory,
    updatedAfter = updatedAfter
  )
  # Only the arguments supplied by the user end up in the url.
  query_args <- Filter(Negate(is.null), query_args)
  has_query <- length(query_args) > 0

  if (has_query) {
    key_value <- Map(function(key, value) paste0(key, "=", value),
                     names(query_args), query_args)
    param2add <- paste0(unlist(key_value, use.names = FALSE), collapse = "&")
  }

  # ---- run one query per chunk of at most insee_sdmx_idbank_limit idbanks ----
  max_seq <- ceiling(n_idbank / insee_sdmx_idbank_limit)

  if (several_queries && insee_download_verbose) {
    msg1 <- "Data download and Dataframe build steps will"
    msg <- sprintf("%s be repeted %s times, unless cached data exist.\n", msg1, max_seq)
    message(crayon::style(msg, "black"))
  }

  list_df <- lapply(seq_len(max_seq), function(j) {
    first <- (j - 1) * insee_sdmx_idbank_limit + 1
    last <- min(j * insee_sdmx_idbank_limit, n_idbank)

    # idbanks of one query are joined with "+" in the url
    list_idbank_selected <- paste0(list_idbank[first:last], collapse = "+")
    link <- sprintf("%s/%s", insee_bdm_series_link, list_idbank_selected)

    if (has_query) {
      link <- paste0(link, "?", param2add)
    }

    get_insee(link = link, step = sprintf("%s/%s", j, max_seq))
  })

  # Chunks for which get_insee() returned NULL are dropped before stacking.
  list_df <- Filter(Negate(is.null), list_df)

  dplyr::bind_rows(list_df)
}