Skip to content

Commit

Permalink
use native ndjson reader (#4196)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 31, 2022
1 parent 1939b5e commit 7d6ba8d
Showing 1 changed file with 17 additions and 29 deletions.
46 changes: 17 additions & 29 deletions polars/polars-io/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,11 @@
use crate::mmap::{MmapBytesReader, ReaderBytes};
use crate::prelude::*;
use arrow::array::StructArray;
use arrow::io::ndjson::read::FallibleStreamingIterator;
pub use arrow::{
error::Result as ArrowResult,
io::{json, ndjson},
};
pub use arrow::{error::Result as ArrowResult, io::json};
use polars_arrow::conversion::chunk_to_struct;
use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
use polars_core::prelude::*;
use std::convert::TryFrom;
use std::io::{Cursor, Seek, Write};
use std::io::Write;
use std::ops::Deref;

pub enum JsonFormat {
Expand Down Expand Up @@ -166,11 +161,11 @@ where
}

fn finish(self) -> Result<DataFrame> {
let mmap_read: ReaderBytes = (&self.reader).into();
let bytes = mmap_read.deref();
let rb: ReaderBytes = (&self.reader).into();

let out = match self.json_format {
JsonFormat::Json => {
let bytes = rb.deref();
let json_value = arrow::io::json::read::json_deserializer::parse(bytes)
.map_err(|err| PolarsError::ComputeError(format!("{:?}", err).into()))?;
// likely struct type
Expand All @@ -182,28 +177,21 @@ where
DataFrame::try_from(arr.clone())
}
JsonFormat::JsonLines => {
let mut file = Cursor::new(bytes);

let dtype = ndjson::read::infer(&mut file, self.infer_schema_len)?;
file.rewind()?;

let mut reader = ndjson::read::FileReader::new(
&mut file,
vec!["".to_string(); self.batch_size],
let mut json_reader = CoreJsonReader::new(
rb,
None,
None,
None,
);
let mut arrays = vec![];
// `next` is IO-bounded
while let Some(rows) = reader.next()? {
// `deserialize` is CPU-bounded
let array = ndjson::read::deserialize(rows, dtype.clone())?;
arrays.push(array);
1024, // sample size
1 << 18,
false,
self.infer_schema_len,
)?;
let mut df: DataFrame = json_reader.as_df()?;
if self.rechunk {
df.as_single_chunk_par();
}
let arr = concatenate_owned_unchecked(&arrays)?;
let arr = arr.as_any().downcast_ref::<StructArray>().ok_or_else(|| {
PolarsError::ComputeError("only can deserialize json objects".into())
})?;
DataFrame::try_from(arr.clone())
Ok(df)
}
}?;

Expand Down

0 comments on commit 7d6ba8d

Please sign in to comment.