-
Notifications
You must be signed in to change notification settings - Fork 0
/
inmet_download_1.R
227 lines (191 loc) · 8.13 KB
/
inmet_download_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# ---------------------------------------------------------------------------- #
# Name : R INMET Download (1)
# Description : Download meteorological data from the Instituto Nacional de
# Meteorologia (INMET) using the url https://portal.inmet.gov.br/dadoshistoricos
# Written by : Rodrigo Lustosa
# Writing date : 16 Jan 2023 12:04 (GMT -03)
# Note: : At this url, data is divided by year. Data from every station
# will be in the same zip file, so all stations will be downloaded.
# ---------------------------------------------------------------------------- #
# initialization ----------------------------------------------------------
# packages
library(tidyverse)
library(stringr)    # str_c, str_remove_all, str_split, str_match, ...
library(stringi)    # stri_trans_general (accent removal)
library(lubridate)  # ymd_hm, year
library(RCurl)      # url.exists
# directory and file names
dir_data_input <- "database/aux/zip_files_yearly_and_allstations"  # raw yearly zips
dir_data_output <- "database/output"  # final merged csv goes here
dir_data_temp <- "database/temp"      # scratch dir: files unzipped here, then deleted
file_output <- "01_inmet.csv"
# date and hour information (inclusive bounds used when filtering rows)
date_hour_start <- ymd_hm("2018-01-01 00:00")
date_hour_end <- ymd_hm("2022-01-01 00:00")
# code stations to be used. If empty, all stations will be selected
# (an empty c() is NULL, which the reader treats as "keep everything")
# for more IDs, check: https://mapas.inmet.gov.br/
station_ids <- c("A701","A755","A771")
# functions ---------------------------------------------------------------
# Name         : Return Urls from INMET to download
# Description  : Build the historical-data zip url for each given year
#                (vectorized: one url per element of `years`)
# Written by   : Rodrigo Lustosa
# Writing date : 17 Jan 2023 16:50 (GMT -03)
return_urls_inmet_download <- function(years) {
  base_url <- "https://portal.inmet.gov.br/uploads/dadoshistoricos/"
  extension <- ".zip"
  paste0(base_url, years, extension)
}
# Name         : Remove complex format
# Description  : Normalize a string for use as a data-frame header: strip
#                accents, parenthesized text and symbols, then snake_case it.
#                Vectorized over `string`; returns a character vector of the
#                same length.
# Written by   : Rodrigo Lustosa
# Writing date : 19 Jan 2023 15:43 (GMT -03)
rm.complex.format <- function(string){
  # remove accents and other complex symbols
  new_string <- stri_trans_general(string, "Latin-ASCII")
  # remove characters inside parenthesis; non-greedy .*? so a string with
  # two parenthesized spans keeps the text between them (greedy .* would
  # delete everything from the first "(" to the last ")")
  new_string <- str_remove_all(new_string,"\\(.*?\\)")
  # remove spaces after and before string
  new_string <- str_trim(new_string)
  # replace spaces by underline
  new_string <- str_replace_all(new_string," ","_")
  # change upper to lower case
  new_string <- str_to_lower(new_string)
  # remove all other characters that aren't letters, digits or underline
  # (NOTE(review): \s in the keep-class also preserves tabs/newlines — only
  # plain spaces were converted to "_" above; confirm that is intended)
  new_string <- str_remove_all(new_string,"[^a-zA-Z0-9_\\s]")
  # replace repeated underlines by a single underline
  new_string <- str_replace_all(new_string,"_+","_")
  return(new_string)
}
# Name         : Download INMET files (1)
# Description  : Download raw yearly INMET zip files and save them at dir_path.
#                Skips files already present and urls that do not exist.
# Written by   : Rodrigo Lustosa
# Writing date : 17 Jan 2023 17:18 (GMT -03)
download_inmet_files_1 <- function(years,dir_path) {
  # basic info
  n_files <- length(years)
  files_names <- str_c(years,".zip")
  urls <- return_urls_inmet_download(years)
  path <- file.path(dir_path, files_names)
  # create destination folder if it does not exist
  # (fix: original tested/created the global dir_data_input, not dir_path;
  #  recursive = TRUE because the default path is several levels deep)
  if (!dir.exists(dir_path))
    dir.create(dir_path, recursive = TRUE)
  # download each file (seq_len is safe when years is empty)
  for (i in seq_len(n_files)) {
    message(str_c("Downloading year ",years[i]))
    # download if url exists and file was not downloaded yet
    if (!file.exists(path[i]))
      if(url.exists(urls[i]))
        download.file(urls[i], destfile = path[i], method="curl")
  }
}
# Name         : Read and tidy up INMET files (1)
# Description  : Read raw INMET files by station, merge and tidy up
# Written by   : Rodrigo Lustosa
# Writing date : 26 Jan 2023 13:33 (GMT -03)
# Arguments    :
#   station_ids     - character vector of station codes to keep; NULL (an
#                     empty c()) keeps every station found in the zips
#   date_hour_start - first date-hour kept (inclusive)
#   date_hour_end   - last date-hour kept (inclusive)
#   dir_read        - directory containing the yearly "<year>.zip" files
#   dir_data_temp   - scratch directory; each csv is unzipped here, read,
#                     then deleted
# Returns      : one data frame with all selected stations/years bound
#                together (columns: codigo, data, plus the csv columns)
read_n_tidy_inmet_files_1 <- function(station_ids,date_hour_start,date_hour_end,
                                      dir_read,dir_data_temp){
  # years to download
  year_start <- year(date_hour_start)
  year_end <- year(date_hour_end)
  all_years <- year_start:year_end
  n_years <- length(all_years)
  # one slot per year; each slot will itself hold a list of data frames
  dados <- vector("list",n_years)
  for(k in 1:n_years){
    # select year
    y <- all_years[k]
    # zip file path
    path <- file.path(dir_read, str_c(y,".zip"))
    # extract all file and directory names inside zip (list=TRUE lists only,
    # nothing is extracted yet)
    allzipfiles <- unzip(path, list=TRUE)
    # separate file and directory names
    if(allzipfiles$Length[1] == 0){
      # case where data is inside a directory: the directory entry has
      # length zero and is first in the listing
      zipfolder <- allzipfiles$Name[1]
      filenames_with_zipdir <- allzipfiles$Name[-1]
      filenames <- str_remove(filenames_with_zipdir,zipfolder)
    }else{
      # case where data is not inside a directory
      zipfolder <- ""
      filenames_with_zipdir <- allzipfiles$Name
      filenames <- allzipfiles$Name
    }
    # extract station ids for each file; the capture group is the 4-6 char
    # station code embedded in names like "INMET_SE_SP_A701_..."
    codigos <- str_match(filenames,
                         "[a-zA-Z]+_[A-Z]{1,2}_[A-Z]{1,2}_([a-zA-Z\\d]{4,6})_.*")
    codigos <- codigos[,2]
    n_files <- length(filenames)
    dados[[k]] <- vector("list",n_files)
    # only stations that are required; skipped slots stay NULL and are
    # dropped later by bind_rows
    if(is.null(station_ids))
      fs <- 1:n_files else # all stations were required (implicit by empty ids)
        fs <- which(codigos %in% station_ids) # required stations were given
    for (f in fs){
      # unzip file f in temporary directory (junkpaths drops the zip folder)
      file_unziped <- unzip(path,filenames_with_zipdir[f],
                            exdir = dir_data_temp,junkpaths = T)
      # read first 9 lines: 8 station-metadata lines + 1 column-header line
      con <- file(file_unziped,encoding = "ISO-8859-15")
      first_lines <- readLines(con,9)
      close(con)
      # read data (';'-separated, ',' decimal; -9999 marks missing values)
      dados[[k]][[f]] <- read.csv2(file_unziped, skip = 9, na.strings = "-9999",
                                   header = F, fileEncoding = "ISO-8859-15")
      # delete unzipped file
      file.remove(file_unziped)
      # extract raw station basic info (lines 1-8) and raw dataframe header
      # (line 9)
      basic_info <- first_lines[-9]
      csv_header <- first_lines[ 9]
      # tidy basic info: each metadata line is "NAME:;value"; turn the
      # character vector into a named list (name -> value)
      basic_info <- str_split(basic_info,":;")
      n_basic_info <- length(basic_info)
      for (i in 1:n_basic_info) {
        names(basic_info)[i] <- basic_info[[i]][1]
        basic_info[[i]] <- basic_info[[i]][2]
      }
      names(basic_info) <- rm.complex.format(names(basic_info))
      # correct cases where files were already incorrect (mis-encoded
      # accents in some source files collapse e.g. "regiao" to "regio")
      names(basic_info)[which(names(basic_info) == "regio")] <- "regiao"
      names(basic_info)[which(names(basic_info) == "estaco")] <- "estacao"
      names(basic_info)[which(names(basic_info) == "data_de_fundaco")] <-
        "data_de_fundacao"
      # tidy header
      csv_header <- str_split(csv_header,";")[[1]]
      csv_header <- rm.complex.format(csv_header)
      # change hour column name to be equal for all files
      csv_header[which(csv_header == "hora_utc")] <- "hora"
      # insert header
      names(dados[[k]][[f]]) <- csv_header
      # tidy and filter data
      dados[[k]][[f]] <- dados[[k]][[f]] %>%
        select(- "") %>% # remove empty column (each data row ends with ';')
        mutate(data = ymd_hm(paste(data, hora)),.keep="unused") %>% # merge date and hours
        mutate(codigo = basic_info$codigo, .before = 1) %>% # tag rows with station code
        filter(data >= date_hour_start & data <= date_hour_end)
    }
    # merge dataframes from year y
    dados[[k]] <- bind_rows(dados[[k]])
  }
  # merge all dataframes
  dados <- bind_rows(dados)
  return(dados)
}
# data information --------------------------------------------------------
# years covered by the requested date range (one zip per year)
year_start <- year(date_hour_start)
year_end <- year(date_hour_end)
all_years <- year_start:year_end
# download files ----------------------------------------------------------
download_inmet_files_1(all_years,dir_data_input)
# read files --------------------------------------------------------------
dados <- read_n_tidy_inmet_files_1(station_ids,date_hour_start,date_hour_end,
                                   dir_data_input,dir_data_temp)
# write final file --------------------------------------------------------
# create folder if it does not exist
# (fix: recursive = TRUE, otherwise dir.create fails when the parent
#  "database/" directory is missing; dir.exists is the idiomatic check)
if (!dir.exists(dir_data_output))
  dir.create(dir_data_output, recursive = TRUE)
# free up unused memory before writing the potentially large csv
gc()
# write data (na = "" leaves blank cells for missing values)
path <- file.path(dir_data_output, file_output)
write_csv(dados,path,na = "")