-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: sorhawell <sorhawell@gmail.com>
- Loading branch information
1 parent
07c10af
commit cb53e75
Showing
12 changed files
with
360 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#' New LazyFrame from NDJSON | ||
#' | ||
#' @description | ||
#' Read a file from path into a polars LazyFrame. | ||
#' @name scan_ndjson | ||
#' @rdname IO_scan_ndjson | ||
#' | ||
#' @param path Path to a file or URL. It is possible to provide multiple paths | ||
#' provided that all NDJSON files have the same schema. It is not possible to | ||
#' provide several URLs. | ||
#' @param infer_schema_length Maximum number of rows to read to infer the column | ||
#' types. If set to 0, all columns will be read as UTF-8. If `NULL`, a full | ||
#' table scan will be done (slow). | ||
#' @param batch_size Number of rows that will be processed per thread. | ||
#' @param n_rows Maximum number of rows to read. | ||
#' @param low_memory Reduce memory usage (will yield a lower performance). | ||
#' @param rechunk Reallocate to contiguous memory when all chunks / files are | ||
#' parsed. | ||
#' @param row_count_name If not `NULL`, this will insert a row count column with | ||
#' the given name into the LazyFrame. | ||
#' @param row_count_offset Offset to start the row_count column (only used if | ||
#' the name is set). | ||
#' @param reuse_downloaded If `TRUE` (default) and a URL was provided, cache the | ||
#' downloaded files for the session so that they can be reused. | ||
#' @return A LazyFrame | ||
#' | ||
# we should use @examplesIf but altdoc doesn't know how to parse it yet | ||
#' @examples | ||
#' if (require("jsonlite", quietly = TRUE)) { | ||
#' ndjson_filename = tempfile() | ||
#' jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) | ||
#' pl$scan_ndjson(ndjson_filename)$collect() | ||
#' } | ||
|
||
pl$scan_ndjson = function(
    path,
    infer_schema_length = 100,
    batch_size = NULL,
    n_rows = NULL,
    low_memory = FALSE,
    rechunk = TRUE,
    row_count_name = NULL,
    row_count_offset = 0,
    reuse_downloaded = TRUE) {
  # Snapshot the arguments first, while the function environment holds nothing
  # but the formals; the list is later passed as-is to the low-level function.
  call_args = as.list(environment())

  # Run `check_is_link` over every path entry (it receives `reuse_downloaded`
  # and `raise_error = TRUE`). The `lapply()` call is handed directly to
  # `result()` so it is evaluated by `result()` itself; robj_to! can unpack the
  # resulting R-result on the Rust side.
  call_args[["path"]] = result(
    lapply(
      path,
      check_is_link,
      reuse_downloaded = reuse_downloaded,
      raise_error = TRUE
    )
  )

  # `reuse_downloaded` is only used above and is not forwarded.
  call_args[["reuse_downloaded"]] = NULL

  # Call the low-level function with the prepared arguments.
  check_no_missing_args(new_from_ndjson, call_args)
  unwrap(do.call(new_from_ndjson, call_args), "in pl$scan_ndjson")
}
|
||
#' New DataFrame from NDJSON | ||
#' | ||
#' @description | ||
#' Read a file from path into a polars DataFrame. | ||
#' @name read_ndjson | ||
#' @rdname IO_read_ndjson | ||
#' | ||
#' @param path Path to a file or URL. It is possible to provide multiple paths | ||
#' provided that all NDJSON files have the same schema. It is not possible to | ||
#' provide several URLs. | ||
#' @param infer_schema_length Maximum number of rows to read to infer the column | ||
#' types. If set to 0, all columns will be read as UTF-8. If `NULL`, a full | ||
#' table scan will be done (slow). | ||
#' @param batch_size Number of rows that will be processed per thread. | ||
#' @param n_rows Maximum number of rows to read. | ||
#' @param low_memory Reduce memory usage (will yield a lower performance). | ||
#' @param rechunk Reallocate to contiguous memory when all chunks / files are | ||
#' parsed. | ||
#' @param row_count_name If not `NULL`, this will insert a row count column with | ||
#' the given name into the DataFrame. | ||
#' @param row_count_offset Offset to start the row_count column (only used if | ||
#' the name is set). | ||
#' | ||
#' @return A DataFrame | ||
#' | ||
# we should use @examplesIf but altdoc doesn't know how to parse it yet | ||
#' @examples | ||
#' if (require("jsonlite", quietly = TRUE)) { | ||
#' ndjson_filename = tempfile() | ||
#' jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) | ||
#' pl$read_ndjson(ndjson_filename) | ||
#' } | ||
pl$read_ndjson = function(
    path,
    infer_schema_length = 100,
    batch_size = NULL,
    n_rows = NULL,
    low_memory = FALSE,
    rechunk = TRUE,
    row_count_name = NULL,
    row_count_offset = 0) {
  # Re-target the call as written by the user at `pl$scan_ndjson`, so only the
  # arguments the user actually supplied are forwarded.
  forwarded_call = match.call()
  polars_pl = get("pl", envir = asNamespace("polars"))
  forwarded_call[[1]] = polars_pl$scan_ndjson

  # Evaluate the rewritten call in the caller's frame, then collect the
  # returned LazyFrame.
  lazy_frame = eval.parent(forwarded_call)
  lazy_frame$collect()
}
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
//read ndjson | ||
|
||
use crate::lazy::dataframe::LazyFrame; | ||
use crate::robj_to; | ||
use crate::rpolarserr::*; | ||
use polars::io::RowCount; | ||
|
||
//use crate::utils::wrappers::*; | ||
use extendr_api::{extendr, prelude::*, Rinternals}; | ||
use polars::prelude as pl; | ||
use polars::prelude::LazyFileListReader; | ||
use std::result::Result; | ||
|
||
#[allow(clippy::too_many_arguments)] | ||
#[extendr] | ||
pub fn new_from_ndjson( | ||
path: Robj, | ||
infer_schema_length: Robj, | ||
batch_size: Robj, | ||
n_rows: Robj, | ||
low_memory: Robj, | ||
rechunk: Robj, | ||
row_count_name: Robj, | ||
row_count_offset: Robj, | ||
) -> RResult<LazyFrame> { | ||
let offset = robj_to!(Option, u32, row_count_offset)?.unwrap_or(0); | ||
let opt_rowcount = | ||
robj_to!(Option, String, row_count_name)?.map(|name| RowCount { name, offset }); | ||
|
||
let vec_pathbuf = robj_to!(Vec, PathBuf, path)?; | ||
let linereader = match vec_pathbuf.len() { | ||
2.. => Ok(pl::LazyJsonLineReader::new_paths(vec_pathbuf.into())), | ||
1 => Ok(pl::LazyJsonLineReader::new(&vec_pathbuf[0])), | ||
_ => rerr().plain("path cannot have zero length").bad_arg("path"), | ||
}?; | ||
|
||
linereader | ||
.with_infer_schema_length(robj_to!(Option, usize, infer_schema_length)?) | ||
.with_batch_size(robj_to!(Option, usize, batch_size)?) | ||
.with_n_rows(robj_to!(Option, usize, n_rows)?) | ||
.low_memory(robj_to!(bool, low_memory)?) | ||
.with_row_count(opt_rowcount) | ||
.with_rechunk(robj_to!(bool, rechunk)?) | ||
.finish() | ||
.map_err(polars_to_rpolars_err) | ||
.map(LazyFrame) | ||
} | ||
|
||
// extendr module declaration: names this module `read_ndjson` and lists | ||
// `new_from_ndjson` (the `#[extendr]` function in this file) as its only function. | ||
extendr_module! { | ||
mod read_ndjson; | ||
fn new_from_ndjson; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.