From 870caad1950c6263bdb9217c2e89f33e01ab71b6 Mon Sep 17 00:00:00 2001 From: Richard Plevin Date: Mon, 10 Jun 2019 15:24:53 -0700 Subject: [PATCH 1/4] Add support for savestreaming method --- src/csv_writer.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/csv_writer.jl b/src/csv_writer.jl index d62f660..150f619 100644 --- a/src/csv_writer.jl +++ b/src/csv_writer.jl @@ -95,3 +95,24 @@ end function fileio_save(s::FileIO.Stream{FileIO.format"TSV"}, data; delim='\t', quotechar='"', escapechar='"', nastring="NA", header=true) return _save(s.io, data, delim=delim, quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) end + +# +# Streaming version writes header (if any) on first call, then appends on subsequent calls. +# +const CSV_or_TSV = Union{FileIO.format"CSV", FileIO.format"TSV"} + +_delim(T, delim) = (delim === nothing ? (T <: FileIO.format"CSV" ? ',' : '\t') : delim) + +function fileio_savestreaming(f::FileIO.File{T}, data; delim=nothing, quotechar='"', escapechar='"', nastring="NA", + header=true) where T <: CSV_or_TSV + io = open(f.filename, "w") + _save(io, data; delim=_delim(T, delim), quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) + + return FileIO.Stream(T, io, f.filename) +end + +function fileio_savestreaming(s::FileIO.Stream{T}, data; delim=nothing, quotechar='"', escapechar='"', nastring="NA", + header=false) where T <: CSV_or_TSV + return _save(s.io, data; delim=_delim(T, delim), quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) +end + From bdfc9828cf0a0cd5ab617d01390fb0b1e0659f94 Mon Sep 17 00:00:00 2001 From: Richard Plevin Date: Tue, 11 Jun 2019 15:38:28 -0700 Subject: [PATCH 2/4] Add tests for savestreaming --- Project.toml | 3 ++- test/runtests.jl | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8a6ebec..672940f 100644 --- a/Project.toml +++ b/Project.toml @@ 
-16,9 +16,10 @@ TextParse = "e0df1984-e451-5cb5-8b61-797a481e67e3" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" [targets] -test = ["Test"] +test = ["Test", "DataFrames"] [compat] CodecZlib = "≥ 0.5.2" diff --git a/test/runtests.jl b/test/runtests.jl index 8007efc..6582f74 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -212,8 +212,31 @@ end @test showable("text/html", x2) == true @test showable("application/vnd.dataresource+json", x2) == true end + +end + +@testset "savestreaming" begin + using DataFrames + df = DataFrame(A = 1:2:1000, B = repeat(1:10, inner=50), C = 1:500) + df1 = df[1:5, :] + df2 = df[6:10, :] + + # Test both csv and tsv formats + for ext in ("csv", "tsv") + fname = "output.$ext" + s = savestreaming(fname, df1) + savestreaming(s, df2) + savestreaming(s, df2) # add this slice twice + close(s) + new_df = DataFrame(load(fname)) + @test new_df[1:5,:] == df1 + @test new_df[6:10,:] == df2 + @test new_df[11:15,:] == df2 + + rm(fname) + end end end # Outer-most testset From a3f4e64f9b1be625adf66ce26252c3f10bc201e3 Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Fri, 30 Aug 2019 15:15:59 -0700 Subject: [PATCH 3/4] Adapt FileIO streaming API --- src/csv_writer.jl | 40 ++++++++++++++++++++++++++++++++++------ test/runtests.jl | 4 ++-- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/csv_writer.jl b/src/csv_writer.jl index 150f619..0d2a2b3 100644 --- a/src/csv_writer.jl +++ b/src/csv_writer.jl @@ -101,18 +101,46 @@ end # const CSV_or_TSV = Union{FileIO.format"CSV", FileIO.format"TSV"} -_delim(T, delim) = (delim === nothing ? (T <: FileIO.format"CSV" ? ',' : '\t') : delim) +_delim(T) = T <: FileIO.format"CSV" ? 
',' : '\t' + +mutable struct CSVFileSaveStream{T} + io::T + first_data_written::Bool + delim::Char + quotechar::Char + escapechar::Char + nastring::AbstractString + header::Bool +end -function fileio_savestreaming(f::FileIO.File{T}, data; delim=nothing, quotechar='"', escapechar='"', nastring="NA", +function fileio_savestreaming(f::FileIO.File{T}, data=nothing; delim=_delim(T), quotechar='"', escapechar='"', nastring="NA", header=true) where T <: CSV_or_TSV io = open(f.filename, "w") - _save(io, data; delim=_delim(T, delim), quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) - return FileIO.Stream(T, io, f.filename) + if data!==nothing + _save(io, data; delim=delim, quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) + end + + return CSVFileSaveStream(io, data!==nothing, delim, quotechar, escapechar, nastring, header) end -function fileio_savestreaming(s::FileIO.Stream{T}, data; delim=nothing, quotechar='"', escapechar='"', nastring="NA", +function fileio_savestreaming(s::FileIO.Stream{T}, data=nothing; delim=_delim(T), quotechar='"', escapechar='"', nastring="NA", header=false) where T <: CSV_or_TSV - return _save(s.io, data; delim=_delim(T, delim), quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) + + if data!==nothing + _save(s.io, data; delim=delim, quotechar=quotechar, escapechar=escapechar, nastring=nastring, header=header) + end + + return CSVFileSaveStream(s.io, data!==nothing, delim, quotechar, escapechar, nastring, header) +end + +# Append `data` to the stream; the header (if enabled) is only written before the first data, then the stream is marked as written. +function Base.write(s::CSVFileSaveStream, data) + result = _save(s.io, data; delim=s.delim, quotechar=s.quotechar, escapechar=s.escapechar, nastring=s.nastring, header=s.first_data_written ? 
false : s.header) + s.first_data_written = true + return result end +function Base.close(s::CSVFileSaveStream) + close(s.io) +end diff --git a/test/runtests.jl b/test/runtests.jl index 6582f74..568b172 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -226,8 +226,8 @@ end for ext in ("csv", "tsv") fname = "output.$ext" s = savestreaming(fname, df1) - savestreaming(s, df2) - savestreaming(s, df2) # add this slice twice + write(s, df2) + write(s, df2) # add this slice twice close(s) new_df = DataFrame(load(fname)) From 76afde8a961dad298ef5ac4eebdcf9bbc3ae9d36 Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Fri, 30 Aug 2019 15:16:14 -0700 Subject: [PATCH 4/4] Remove a debug output --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 568b172..f1a3c2c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -92,7 +92,6 @@ end output_filename4 = tempname() * ".csv" try - @show output_filename4 array |> save(output_filename4, quotechar=nothing) finally