# Testing-CSV-vs-Parquet.R
library(microbenchmark)
library(readr)
library(arrow)
# Source CSV and the local file name it is stored under.
url <- "https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2022/download/Fahrzeiten_SOLL_IST_20221225_20221231.csv"
filename <- "fahrzeiten_soll_ist_20221225_20221231.csv"

# Initially download the file so that every benchmark below reads the same
# local copy (a fair comparison: no network access or caching in the timings).
# R's default download timeout (60 seconds) can be too short for a large file,
# so raise it for the duration of the download and restore it afterwards.
previous_timeout <- getOption("timeout")
options(timeout = max(600, previous_timeout))
# mode = "wb" stores the bytes exactly as served on every platform.
download.file(url = url, destfile = filename, mode = "wb")
options(timeout = previous_timeout)
# Define functions for reading CSV file

# Load the CSV without supplying any column types, leaving the type
# detection entirely to read_csv().
#
# path: CSV file to read. Defaults to the script-level `filename`, so existing
#   zero-argument calls behave as before.
# Returns the data frame produced by read_csv().
read_csv_without_specifications <- function(path = filename) {
  read_csv(path)
}
# Load the CSV without explicit column types, but pass guess_max = Inf so the
# type guessing is not capped at a fixed number of rows.
#
# path: CSV file to read. Defaults to the script-level `filename`, so existing
#   zero-argument calls behave as before.
# Returns the data frame produced by read_csv().
read_csv_specifications_using_all <- function(path = filename) {
  read_csv(path, guess_max = Inf)
}
# Load CSV while indicating attributes of type 'date'.
# Only the three date columns get an explicit type; every other column is
# still left to read_csv()'s type detection.
#
# path: CSV file to read. Defaults to the script-level `filename`, so existing
#   zero-argument calls behave as before.
# Returns the data frame produced by read_csv().
read_csv_with_date_specifications <- function(path = filename) {
  # All three date columns are parsed with the same day.month.year pattern.
  date_format <- "%d.%m.%y"
  read_csv(
    path,
    col_types = cols(
      betriebsdatum = col_date(format = date_format),
      datum_von = col_date(format = date_format),
      datum_nach = col_date(format = date_format)
    )
  )
}
# Load CSV while indicating all attribute types.
# Every listed column gets an explicit collector (integer, character or date).
#
# path: CSV file to read. Defaults to the script-level `filename`, so existing
#   zero-argument calls behave as before.
# Returns the data frame produced by read_csv().
read_csv_with_all_specifications <- function(path = filename) {
  # All three date columns are parsed with the same day.month.year pattern.
  date_format <- "%d.%m.%y"
  read_csv(
    path,
    col_types = cols(
      linie = col_integer(),
      richtung = col_integer(),
      betriebsdatum = col_date(format = date_format),
      fahrzeug = col_integer(),
      kurs = col_integer(),
      seq_von = col_integer(),
      halt_diva_von = col_integer(),
      halt_punkt_diva_von = col_integer(),
      halt_kurz_von1 = col_character(),
      datum_von = col_date(format = date_format),
      soll_an_von = col_integer(),
      ist_an_von = col_integer(),
      soll_ab_von = col_integer(),
      ist_ab_von = col_integer(),
      seq_nach = col_integer(),
      halt_diva_nach = col_integer(),
      halt_punkt_diva_nach = col_integer(),
      halt_kurz_nach1 = col_character(),
      datum_nach = col_date(format = date_format),
      soll_an_nach = col_integer(),
      ist_an_nach1 = col_integer(),
      soll_ab_nach = col_integer(),
      ist_ab_nach = col_integer(),
      fahrt_id = col_integer(),
      fahrweg_id = col_integer(),
      fw_no = col_integer(),
      fw_typ = col_integer(),
      fw_kurz = col_integer(),
      fw_lang = col_character(),
      umlauf_von = col_integer(),
      halt_id_von = col_integer(),
      halt_id_nach = col_integer(),
      halt_punkt_id_von = col_integer(),
      halt_punkt_id_nach = col_integer()
    )
  )
}
# Benchmark functions
# Time the three CSV readers (no specs, date specs, all specs), 10 runs each.
reading_csv <- microbenchmark(
  "Read CSV w/o specs" = read_csv_without_specifications(),
  "Read CSV w/ date specs" = read_csv_with_date_specifications(),
  "Read CSV w/ all specs" = read_csv_with_all_specifications(),
  times = 10
)
reading_csv
# Plot through the boxplot() generic, which dispatches to the microbenchmark
# method, instead of reaching into the package's private namespace with `:::`.
boxplot(reading_csv)
# Compare type guessing with guess_max = Inf against the default read,
# 10 runs each.
reading_csv2 <- microbenchmark(
  "Read CSV w/ specs based on all" = read_csv_specifications_using_all(),
  "Read CSV w/o specs" = read_csv_without_specifications(),
  times = 10
)
reading_csv2
# Plot through the boxplot() generic, which dispatches to the microbenchmark
# method, instead of reaching into the package's private namespace with `:::`.
boxplot(reading_csv2)
# Look at size of objects (dataframes) in memory.
# Read the file once per reader variant ...
df <- read_csv_without_specifications()
df2 <- read_csv_with_date_specifications()
df3 <- read_csv_with_all_specifications()
# ... then report how much memory each resulting data frame occupies.
object.size(x = df)
object.size(x = df2)
object.size(x = df3)
# Write dataframes to Parquet files (one file per reader variant)
write_parquet(df, sink = "parquet-test--df.parquet")
write_parquet(df2, sink = "parquet-test--df2.parquet")
write_parquet(df3, sink = "parquet-test--df3.parquet")
# Loading data from Parquet files
df_ <- read_parquet(file = "parquet-test--df.parquet")
df2_ <- read_parquet(file = "parquet-test--df2.parquet")
df3_ <- read_parquet(file = "parquet-test--df3.parquet")
# In-memory size of the data frames read back from Parquet
object.size(x = df_)
object.size(x = df2_)
object.size(x = df3_)
# Time reading each of the three Parquet files, 10 runs each.
reading_parquet <- microbenchmark(
  "Read Parquet w/o specs" = read_parquet("parquet-test--df.parquet"),
  "Read Parquet w/ date specs" = read_parquet("parquet-test--df2.parquet"),
  "Read Parquet w/ all specs" = read_parquet("parquet-test--df3.parquet"),
  times = 10
)
reading_parquet
# Plot through the boxplot() generic, which dispatches to the microbenchmark
# method, instead of reaching into the package's private namespace with `:::`.
boxplot(reading_parquet)
# Head-to-head: CSV read without specs vs. the Parquet file written from the
# fully specified data frame, 10 runs each.
reading_csv_and_parquet <- microbenchmark(
  "Read CSV w/o specs" = read_csv_without_specifications(),
  "Read Parquet w/ all specs" = read_parquet("parquet-test--df3.parquet"),
  times = 10
)
reading_csv_and_parquet
# Plot through the boxplot() generic, which dispatches to the microbenchmark
# method, instead of reaching into the package's private namespace with `:::`.
boxplot(reading_csv_and_parquet)