Skip to content

Commit

Permalink
Fast json (#3324)
Browse files Browse the repository at this point in the history
  • Loading branch information
universalmind303 committed May 9, 2022
1 parent 106a0c3 commit eb42b99
Show file tree
Hide file tree
Showing 14 changed files with 540 additions and 13 deletions.
4 changes: 2 additions & 2 deletions nodejs-polars/__tests__/dataframe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1386,7 +1386,7 @@ describe("io", () => {
}
});
df.writeJSON(writeStream, {format:"lines"});
const newDF = pl.readJSON(body);
const newDF = pl.readJSON(body).select("foo", "bar");
expect(newDF).toFrameEqual(df);
done();
});
Expand All @@ -1396,7 +1396,7 @@ describe("io", () => {
pl.Series("bar", ["a", "b", "c"])
]);
df.writeJSON("./test.json", {format:"lines"});
const newDF = pl.readJSON("./test.json");
const newDF = pl.readJSON("./test.json").select("foo", "bar");
expect(newDF).toFrameEqual(df);
fs.rmSync("./test.json");
done();
Expand Down
3 changes: 3 additions & 0 deletions nodejs-polars/polars/dataframe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1744,6 +1744,7 @@ export const _DataFrame = (_df: any): DataFrame => {
return wrap("sampleN",
1,
withReplacement,
false,
seed
);
}
Expand All @@ -1754,13 +1755,15 @@ export const _DataFrame = (_df: any): DataFrame => {
return wrap("sampleN",
opts,
withReplacement,
false,
seed
);
}
if(typeof frac === "number") {
return wrap("sampleFrac",
frac,
withReplacement,
false,
seed
);
}
Expand Down
5 changes: 3 additions & 2 deletions nodejs-polars/polars/io.ts
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ export function scanCSV(path, options?) {
export function readJSON(pathOrBody: string | Buffer, options?: any): DataFrame
export function readJSON(pathOrBody, options = readJsonDefaultOptions) {
options = {...readJsonDefaultOptions, ...options};
let method = options.format === "lines" ? pli.readJsonLines : pli.readJson;
const extensions = [".ndjson", ".json", ".jsonl"];
if (Buffer.isBuffer(pathOrBody)) {
return _DataFrame(pli.readJson(pathOrBody, options));
Expand All @@ -240,9 +241,9 @@ export function readJSON(pathOrBody, options = readJsonDefaultOptions) {
if (typeof pathOrBody === "string") {
const inline = !isPath(pathOrBody, extensions);
if (inline) {
return _DataFrame(pli.readJson(Buffer.from(pathOrBody, "utf-8"), options));
return _DataFrame(method(Buffer.from(pathOrBody, "utf-8"), options));
} else {
return _DataFrame(pli.readJson(pathOrBody, options));
return _DataFrame(method(pathOrBody, options));
}
} else {
throw new Error("must supply either a path or body");
Expand Down
5 changes: 3 additions & 2 deletions nodejs-polars/polars/lazy/expr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -927,11 +927,12 @@ export const _Expr = (_expr: any): Expr => {
throw new Error("sample_n is not yet supported for expr");
}
if(typeof frac === "number") {
return wrap("sampleFrac", {
return wrap("sampleFrac",
frac,
withReplacement,
false,
seed
});
);
}
else {
throw new TypeError("must specify either 'frac' or 'n'");
Expand Down
3 changes: 3 additions & 0 deletions nodejs-polars/polars/series/series.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,7 @@ export function _Series(_s: any): Series {
return wrap("sampleN",
1,
withReplacement,
false,
seed
);
}
Expand All @@ -1581,13 +1582,15 @@ export function _Series(_s: any): Series {
return wrap("sampleN",
opts,
withReplacement,
false,
seed
);
}
if(typeof frac === "number") {
return wrap("sampleFrac",
frac,
withReplacement,
false,
seed
);
}
Expand Down
32 changes: 30 additions & 2 deletions nodejs-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,30 @@ pub struct WriteJsonOptions {
pub format: String,
}

#[napi]
pub fn read_json_lines(
path_or_buffer: Either<String, Buffer>,
options: ReadJsonOptions,
) -> napi::Result<JsDataFrame> {
let infer_schema_length = options.infer_schema_length.unwrap_or(100) as usize;
let batch_size = options.batch_size.unwrap_or(10000) as usize;

let df = match path_or_buffer {
Either::A(path) => JsonLineReader::from_path(path)
.expect("unable to read file")
.infer_schema_len(Some(infer_schema_length))
.finish()
.map_err(JsPolarsErr::from)?,
Either::B(buf) => {
let cursor = Cursor::new(buf.as_ref());
JsonLineReader::new(cursor)
.infer_schema_len(Some(infer_schema_length))
.finish()
.map_err(JsPolarsErr::from)?
}
};
Ok(df.into())
}
#[napi]
pub fn read_json(
path_or_buffer: Either<String, Buffer>,
Expand Down Expand Up @@ -1013,11 +1037,13 @@ impl JsDataFrame {
&self,
n: i64,
with_replacement: bool,
shuffle: bool,
seed: Option<i64>,
) -> napi::Result<JsDataFrame> {

let df = self
.df
.sample_n(n as usize, with_replacement, seed.map(|s| s as u64))
.sample_n(n as usize, with_replacement, shuffle, seed.map(|s| s as u64))
.map_err(JsPolarsErr::from)?;
Ok(df.into())
}
Expand All @@ -1027,11 +1053,13 @@ impl JsDataFrame {
&self,
frac: f64,
with_replacement: bool,
shuffle: bool,
seed: Option<i64>,
) -> napi::Result<JsDataFrame> {

let df = self
.df
.sample_frac(frac, with_replacement, seed.map(|s| s as u64))
.sample_frac(frac, with_replacement, shuffle, seed.map(|s| s as u64))
.map_err(JsPolarsErr::from)?;
Ok(df.into())
}
Expand Down
4 changes: 2 additions & 2 deletions nodejs-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1184,11 +1184,11 @@ impl JsExpr {
}

#[napi]
pub fn sample_frac(&self, frac: f64, with_replacement: bool, seed: Option<i64>) -> JsExpr {
pub fn sample_frac(&self, frac: f64, with_replacement: bool, shuffle: bool, seed: Option<i64>) -> JsExpr {
let seed = seed.map(|s| s as u64);
self.inner
.clone()
.sample_frac(frac, with_replacement, seed)
.sample_frac(frac, with_replacement, shuffle, seed)
.into()
}
#[napi]
Expand Down
6 changes: 4 additions & 2 deletions nodejs-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,14 +548,15 @@ impl JsSeries {
&self,
n: u32,
with_replacement: bool,
shuffle: bool,
seed: Option<Wrap<u64>>,
) -> napi::Result<Self> {
// Safety:
// Wrap is transparent.
let seed: Option<u64> = unsafe { std::mem::transmute(seed) };
let s = self
.series
.sample_n(n as usize, with_replacement, seed)
.sample_n(n as usize, with_replacement, shuffle, seed)
.map_err(JsPolarsErr::from)?;
Ok(s.into())
}
Expand All @@ -565,14 +566,15 @@ impl JsSeries {
&self,
frac: f64,
with_replacement: bool,
shuffle: bool,
seed: Option<Wrap<u64>>,
) -> napi::Result<Self> {
// Safety:
// Wrap is transparent.
let seed: Option<u64> = unsafe { std::mem::transmute(seed) };
let s = self
.series
.sample_frac(frac, with_replacement, seed)
.sample_frac(frac, with_replacement, shuffle, seed)
.map_err(JsPolarsErr::from)?;
Ok(s.into())
}
Expand Down
1 change: 0 additions & 1 deletion polars/polars-io/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
use polars_core::prelude::*;
use std::convert::TryFrom;
use std::io::{BufRead, Seek, Write};

pub enum JsonFormat {
Json,
JsonLines,
Expand Down
4 changes: 4 additions & 0 deletions polars/polars-io/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ pub mod ipc;
#[cfg(feature = "json")]
#[cfg_attr(docsrs, doc(cfg(feature = "json")))]
pub mod json;
#[cfg(feature = "json")]
#[cfg_attr(docsrs, doc(cfg(feature = "json")))]
pub mod ndjson_core;

#[cfg(any(feature = "csv-file", feature = "parquet"))]
pub mod mmap;
mod options;
Expand Down

0 comments on commit eb42b99

Please sign in to comment.