# Skip to content
# Permalink
# Branch: master
# Find file Copy path
# Find file Copy path
# Fetching contributors…
# Cannot retrieve contributors at this time
# 285 lines (268 sloc) 6.4 KB
# Path of the fixed-width file to benchmark: the first trailing command-line
# argument. Fail early with a usage message instead of the opaque
# "subscript out of bounds" error that `[[1]]` raises on an empty vector.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 1L) {
  stop(
    "Missing argument: path to the fixed-width file to benchmark.",
    call. = FALSE
  )
}
file <- cli_args[[1]]

# Labels for the timed steps. Each benchmark function below runs exactly one
# expression per label, in this order.
desc <- c(
  "setup", "read", "print", "head", "tail", "sample", "filter", "aggregate"
)
library(vroom)
# Column layout of the fixed-width file, one row per field:
# begin (1-based start), end, width, and the column name.
#
# The table is parsed from inline literal text. The delimiter must match the
# separator actually used between the four values on each line; the rows
# below are separated by a single space, so `delim` is a space as well.
fields <- vroom(col_names = c("begin", "end", "width", "col_names"), delim = " ",
"1 1 1 RECTYPE
2 8 7 SERIALNO
9 9 1 SAMPLE
10 11 2 STATE
12 12 1 REGION
13 13 1 DIVISION
14 18 5 PUMA5
19 23 5 PUMA1
24 27 4 MSACMSA5
28 31 4 MSAPMSA5
32 35 4 MSACMSA1
36 39 4 MSAPMSA1
40 41 2 AREATYP5
42 43 2 AREATYP1
44 57 14 TOTPUMA5
58 71 14 LNDPUMA5
72 85 14 TOTPUMA1
86 99 14 LNDPUMA1
100 101 2 SUBSAMPL
102 105 4 HWEIGHT
106 107 2 PERSONS
108 108 1 UNITTYPE
109 109 1 HSUB
110 110 1 HAUG
111 111 1 VACSTAT
112 112 1 VACSTATA
113 113 1 TENURE
114 114 1 TENUREA
115 116 2 BLDGSZ
117 117 1 BLDGSZA
118 118 1 YRBUILT
119 119 1 YRBUILTA
120 120 1 YRMOVED
121 121 1 YRMOVEDA
122 122 1 ROOMS
123 123 1 ROOMSA
124 124 1 BEDRMS
125 125 1 BEDRMSA
126 126 1 CPLUMB
127 127 1 CPLUMBA
128 128 1 CKITCH
129 129 1 CKITCHA
130 130 1 PHONE
131 131 1 PHONEA
132 132 1 FUEL
133 133 1 FUELA
134 134 1 VEHICL
135 135 1 VEHICLA
136 136 1 BUSINES
137 137 1 BUSINESA
138 138 1 ACRES
139 139 1 ACRESA
140 140 1 AGSALES
141 141 1 AGSALESA
142 145 4 ELEC
146 146 1 ELECA
147 150 4 GAS
151 151 1 GASA
152 155 4 WATER
156 156 1 WATERA
157 160 4 OIL
161 161 1 OILA
162 165 4 RENT
166 166 1 RENTA
167 167 1 MEALS
168 168 1 MEALSA
169 169 1 MORTG1
170 170 1 MORTG1A
171 175 5 MRT1AMT
176 176 1 MRT1AMTA
177 177 1 MORTG2
178 178 1 MORTG2A
179 183 5 MRT2AMT
184 184 1 MRT2AMTA
185 185 1 TAXINCL
186 186 1 TAXINCLA
187 188 2 TAXAMT
189 189 1 TAXAMTA
190 190 1 INSINCL
191 191 1 INSINCLA
192 195 4 INSAMT
196 196 1 INSAMTA
197 200 4 CONDFEE
201 201 1 CONDFEEA
202 203 2 VALUE
204 204 1 VALUEA
205 205 1 MHLOAN
206 206 1 MHLOANA
207 211 5 MHCOST
212 212 1 MHCOSTA
213 213 1 HHT
214 215 2 P65
216 217 2 P18
218 219 2 NPF
220 221 2 NOC
222 223 2 NRC
224 224 1 PSF
225 225 1 PAOC
226 226 1 PARC
227 227 1 SVAL
228 232 5 SMOC
233 235 3 SMOCAPI
236 236 1 SRNT
237 240 4 GRENT
241 243 3 GRAPI
244 244 1 FNF
245 245 1 HHL
246 246 1 LNGI
247 247 1 WIF
248 248 1 EMPSTAT
249 250 2 WORKEXP
251 258 8 HINC
259 266 8 FINC
")
# Shift the 1-based start positions down by one before handing the layout to
# the fixed-width readers (presumably they expect a zero-based `begin`, as
# produced by fwf_positions() -- confirm against the installed versions).
fields$begin <- fields$begin - 1
# Columns that are read as character; every other column falls back to the
# double default. The order matches the order the columns are declared in the
# resulting specification.
character_cols <- c(
  "RECTYPE", "SERIALNO", "STATE", "PUMA5", "PUMA1",
  "MSACMSA5", "MSAPMSA5", "MSACMSA1", "MSAPMSA1",
  "AREATYP5", "AREATYP1", "LNDPUMA5", "TOTPUMA1", "LNDPUMA1",
  "SUBSAMPL", "HWEIGHT", "PERSONS", "BLDGSZ",
  "ELEC", "GAS", "WATER", "OIL", "RENT",
  "MRT1AMT", "MRT2AMT", "TAXAMT", "INSAMT", "CONDFEE", "VALUE", "MHCOST",
  "P65", "P18", "NPF", "NOC", "NRC",
  "SMOC", "SMOCAPI", "GRENT", "GRAPI", "WORKEXP", "HINC", "FINC"
)

# Column specification shared by the vroom and readr benchmarks: equivalent
# to calling cols() with `.default = col_double()` and one
# `<name> = col_character()` argument per entry of `character_cols`.
types <- do.call(
  cols,
  c(
    list(.default = col_double()),
    setNames(
      rep(list(col_character()), length(character_cols)),
      character_cols
    )
  )
)
# Benchmark vroom_fwf() (no explicit altrep_opts) followed by dplyr verbs.
#
# bench::workout() is given one braced block; the block holds exactly eight
# top-level expressions, labelled in order by `desc`:
# setup, read, print, head, tail, sample, filter, aggregate.
#
# file:   path to the fixed-width file.
# fields: column positions passed to vroom_fwf().
# types:  column specification passed as `col_types`.
# desc:   labels for the timed expressions.
#
# Returns the result of bench::workout().
vroom <- function(file, fields, types, desc) {
  bench::workout(
    {
      ({
        library(vroom)
        library(dplyr)
      })
      dat <- vroom_fwf(file, fields, col_types = types)
      print(dat)
      first_rows <- head(dat)
      last_rows <- tail(dat)
      random_rows <- sample_n(dat, 100)
      six_person_rows <- filter(dat, PERSONS == "06")
      state_means <- group_by(dat, STATE) %>%
        summarise(avg_TOTPUM5 = mean(TOTPUMA5))
    },
    description = desc
  )
}
# Benchmark vroom_fwf() with `altrep_opts = TRUE`, followed by dplyr verbs.
#
# The braced block passed to bench::workout() contains exactly eight
# top-level expressions, one per label in `desc` (setup, read, print, head,
# tail, sample, filter, aggregate).
#
# file:   path to the fixed-width file.
# fields: column positions passed to vroom_fwf().
# types:  column specification passed as `col_types`.
# desc:   labels for the timed expressions.
#
# Returns the result of bench::workout().
`vroom (full altrep)` <- function(file, fields, types, desc) {
  bench::workout(
    {
      ({
        library(vroom)
        library(dplyr)
      })
      tbl <- vroom_fwf(file, fields, col_types = types, altrep_opts = TRUE)
      print(tbl)
      top <- head(tbl)
      bottom <- tail(tbl)
      sampled <- sample_n(tbl, 100)
      filtered <- filter(tbl, PERSONS == "06")
      aggregated <- group_by(tbl, STATE) %>%
        summarise(avg_TOTPUM5 = mean(TOTPUMA5))
    },
    description = desc
  )
}
# Benchmark vroom_fwf() with `altrep_opts = FALSE`, followed by dplyr verbs.
#
# Eight top-level expressions are handed to bench::workout(), labelled in
# order by `desc`: setup, read, print, head, tail, sample, filter, aggregate.
#
# file:   path to the fixed-width file.
# fields: column positions passed to vroom_fwf().
# types:  column specification passed as `col_types`.
# desc:   labels for the timed expressions.
#
# Returns the result of bench::workout().
`vroom (no altrep)` <- function(file, fields, types, desc) {
  bench::workout(
    {
      ({
        library(vroom)
        library(dplyr)
      })
      records <- vroom_fwf(
        file, fields,
        col_types = types, altrep_opts = FALSE
      )
      print(records)
      head_rows <- head(records)
      tail_rows <- tail(records)
      sample_rows <- sample_n(records, 100)
      matching_rows <- filter(records, PERSONS == "06")
      per_state <- group_by(records, STATE) %>%
        summarise(avg_TOTPUM5 = mean(TOTPUMA5))
    },
    description = desc
  )
}
# Benchmark readr's read_fwf() followed by the same dplyr verbs used for the
# vroom benchmarks.
#
# bench::workout() receives eight top-level expressions, labelled in order by
# `desc`: setup, read, print, head, tail, sample, filter, aggregate.
#
# file:   path to the fixed-width file.
# fields: column positions passed to read_fwf().
# types:  column specification passed as `col_types`.
# desc:   labels for the timed expressions.
#
# Returns the result of bench::workout().
readr <- function(file, fields, types, desc) {
  bench::workout(
    {
      ({
        library(readr)
        library(dplyr)
      })
      df <- read_fwf(file, fields, col_types = types)
      print(df)
      df_head <- head(df)
      df_tail <- tail(df)
      df_sample <- sample_n(df, 100)
      df_filtered <- filter(df, PERSONS == "06")
      df_summary <- group_by(df, STATE) %>%
        summarise(avg_TOTPUM5 = mean(TOTPUMA5))
    },
    description = desc
  )
}
# Benchmark base R's read.fwf() with base-R equivalents of the dplyr steps.
#
# bench::workout() receives eight top-level expressions, labelled in order by
# `desc`: setup, read, print, head, tail, sample, filter, aggregate.
#
# file:   path to the fixed-width file.
# fields: column layout; `width` and `col_names` are used.
# types:  column specification; used only to find out which columns must be
#         read as character.
# desc:   labels for the timed expressions.
#
# Returns the result of bench::workout().
read.delim <- function(file, fields, types, desc) {
  # Without colClasses, read.fwf() type-converts zero-padded codes such as
  # "06" to integers, so `x$PERSONS == "06"` would compare "6" with "06" and
  # never match, making the filter step benchmark an empty result. Force the
  # columns declared as character in `types` to stay character; NA leaves the
  # remaining columns to the default type conversion, as before.
  # This is computed before the workout so it is neither timed nor counted as
  # an extra step.
  col_classes <- vapply(
    fields$col_names,
    function(nm) {
      spec <- types$cols[[nm]]
      if (is.null(spec)) {
        spec <- types$default
      }
      if (inherits(spec, "collector_character")) "character" else NA_character_
    },
    character(1),
    USE.NAMES = FALSE
  )

  bench::workout(
    {
      ({})
      x <- read.fwf(
        file,
        widths = fields$width,
        col.names = fields$col_names,
        colClasses = col_classes
      )
      print(head(x, 10))
      a <- head(x)
      b <- tail(x)
      c <- x[sample(NROW(x), 100), ]
      d <- x[x$PERSONS == "06", ]
      e <- tapply(x$TOTPUMA5, x$STATE, mean)
    },
    description = desc
  )
}
# Run each benchmark function through callr::r() with the same argument list
# and collect the results in a named list. The names identify the benchmark
# in the results.
times <- lapply(
  list(
    vroom_dplyr = vroom,
    `vroom (full altrep)_dplyr` = `vroom (full altrep)`,
    `vroom (no altrep)_dplyr` = `vroom (no altrep)`,
    readr = readr,
    read.delim = read.delim
  ),
  function(bench_fn) {
    callr::r(bench_fn, list(file, fields, types, desc))
  }
)
library(purrr)
library(tidyr)
library(dplyr)
library(forcats)
library(bench)
# Read the file once more so its dimensions can be stored with the timings.
data <- vroom_fwf(file, fields, col_types = types)

# One row per benchmark and step, with the process and real timings side by
# side; the list names of `times` end up in the `package` column.
tm_df <- map_dfr(
  times,
  function(res) {
    tibble::tibble(
      op = factor(res$exprs, levels = desc, ordered = TRUE),
      process = as.numeric(res$process),
      real = as.numeric(res$real)
    )
  },
  .id = "package"
)

# Stack the two timing columns into `type` / `time` pairs.
tm_df <- gather(tm_df, type, time, -package, -op)

# Attach the size and dimensions of the benchmarked file to every row.
tm_df <- mutate(
  tm_df,
  size = file.size(file),
  rows = nrow(data),
  cols = ncol(data)
)
# Write the timings as a tab-separated file under inst/bench.
timings_path <- here::here("inst", "bench", "fwf-times.tsv")
vroom::vroom_write(tm_df, timings_path, delim = "\t")

# Record the versions of the packages involved next to the timings.
pkg_versions <- sessioninfo::package_info(
  c("vroom", "readr", "dplyr", "data.table", "base"),
  dependencies = FALSE,
  include_base = TRUE
)
sessioninfo_path <- here::here("inst", "bench", "sessioninfo.tsv")
vroom::vroom_write(pkg_versions, sessioninfo_path, delim = "\t")
# You can’t perform that action at this time.