Skip to content

Commit

Permalink
rust polars 0.22.0 & update arrow to crates.io: ~2x json parsing improvement (#3588)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jun 6, 2022
1 parent f123f80 commit 61e7627
Show file tree
Hide file tree
Showing 13 changed files with 104 additions and 89 deletions.
12 changes: 6 additions & 6 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
keywords = ["dataframe", "query-engine", "arrow"]
Expand Down Expand Up @@ -242,11 +242,11 @@ bench = [
]

[dependencies]
polars-core = { version = "0.21.1", path = "./polars-core", features = ["docs", "private"], default-features = false }
polars-io = { version = "0.21.1", path = "./polars-io", features = ["private"], default-features = false, optional = true }
polars-lazy = { version = "0.21.1", path = "./polars-lazy", features = ["private"], default-features = false, optional = true }
polars-ops = { version = "0.21.1", path = "./polars-ops" }
polars-time = { version = "0.21.1", path = "./polars-time", default-features = false, optional = true }
polars-core = { version = "0.22.1", path = "./polars-core", features = ["docs", "private"], default-features = false }
polars-io = { version = "0.22.0", path = "./polars-io", features = ["private"], default-features = false, optional = true }
polars-lazy = { version = "0.22.0", path = "./polars-lazy", features = ["private"], default-features = false, optional = true }
polars-ops = { version = "0.22.1", path = "./polars-ops" }
polars-time = { version = "0.22.0", path = "./polars-time", default-features = false, optional = true }

[dev-dependencies]
ahash = "0.7"
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-arrow"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand All @@ -9,10 +9,10 @@ description = "Arrow interfaces for Polars DataFrame library"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7014e28de391960f9aac578eada14796bf6950d2", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7014e28de391960f9aac578eada14796bf6950d2", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", path = "../../../arrow2", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "improve_mutable", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", version = "0.11", default-features = false, features = ["compute_concatenate"] }
arrow = { package = "arrow2", version = "0.12", default-features = false, features = ["compute_concatenate"] }
hashbrown = "0.12"
num = "^0.4"
serde = { version = "1", features = ["derive"], optional = true }
Expand Down
12 changes: 6 additions & 6 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-core"
version = "0.21.1"
version = "0.22.1"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand Down Expand Up @@ -161,8 +161,8 @@ jsonpath_lib = { version = "0.3.0", optional = true, git = "https://github.com/r
# Optional dependency, pulled in without ndarray's default feature set.
ndarray = { version = "0.15", optional = true, default-features = false }
num = "^0.4"
once_cell = "1"
polars-arrow = { version = "0.21.1", path = "../polars-arrow", features = ["compute"] }
polars-utils = { version = "0.21.1", path = "../polars-utils" }
polars-arrow = { version = "0.22.0", path = "../polars-arrow", features = ["compute"] }
polars-utils = { version = "0.22.0", path = "../polars-utils" }
rand = { version = "0.8", optional = true, features = ["small_rng", "std"] }
rand_distr = { version = "0.4", optional = true }
rayon = "1.5"
Expand All @@ -174,12 +174,12 @@ thiserror = "^1.0"

[dependencies.arrow]
package = "arrow2"
git = "https://github.com/jorgecarleitao/arrow2"
# git = "https://github.com/jorgecarleitao/arrow2"
# git = "https://github.com/ritchie46/arrow2"
rev = "7014e28de391960f9aac578eada14796bf6950d2"
# rev = "7014e28de391960f9aac578eada14796bf6950d2"
# path = "../../../arrow2"
# branch = "improve_mutable"
# version = "0.11"
version = "0.12"
default-features = false
features = [
"compute_aggregate",
Expand Down
1 change: 1 addition & 0 deletions polars/polars-core/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ pub use once_cell;
#[cfg(feature = "private")]
pub use rayon;
#[cfg(feature = "private")]
#[cfg(any(feature = "strings", feature = "temporal"))]
pub use regex;
16 changes: 8 additions & 8 deletions polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-io"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand Down Expand Up @@ -35,9 +35,9 @@ private = ["polars-time/private"]
[dependencies]
ahash = "0.7"
anyhow = "1.0"
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7014e28de391960f9aac578eada14796bf6950d2", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7014e28de391960f9aac578eada14796bf6950d2", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "improve_mutable", default-features = false }
# arrow = { package = "arrow2", version = "0.11", default-features = false }
arrow = { package = "arrow2", version = "0.12", default-features = false }
# arrow = { package = "arrow2", path = "../../../arrow2", default-features = false }
csv-core = { version = "0.1.10", optional = true }
dirs = "4.0"
Expand All @@ -47,14 +47,14 @@ memchr = "2.4"
memmap = { package = "memmap2", version = "0.5.2", optional = true }
num = "^0.4"
once_cell = "1"
polars-arrow = { version = "0.21.1", path = "../polars-arrow" }
polars-core = { version = "0.21.1", path = "../polars-core", features = ["private"], default-features = false }
polars-time = { version = "0.21.1", path = "../polars-time", features = ["private"], default-features = false, optional = true }
polars-utils = { version = "0.21.1", path = "../polars-utils" }
polars-arrow = { version = "0.22.0", path = "../polars-arrow" }
polars-core = { version = "0.22.1", path = "../polars-core", features = ["private"], default-features = false }
polars-time = { version = "0.22.0", path = "../polars-time", features = ["private"], default-features = false, optional = true }
polars-utils = { version = "0.22.0", path = "../polars-utils" }
rayon = "1.5"
regex = "1.5"
serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", optional = true }
serde_json = { version = "1", optional = true, default-features = false, features = ["alloc"] }
simdutf8 = "0.1"

[dev-dependencies]
Expand Down
32 changes: 20 additions & 12 deletions polars/polars-io/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
//! +-----+--------+-------+--------+
//! ```
//!
use crate::mmap::{MmapBytesReader, ReaderBytes};
use crate::prelude::*;
use arrow::array::{ArrayRef, StructArray};
use arrow::io::ndjson::read::FallibleStreamingIterator;
Expand All @@ -72,7 +73,9 @@ use polars_arrow::conversion::chunk_to_struct;
use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
use polars_core::prelude::*;
use std::convert::TryFrom;
use std::io::{BufRead, Seek, Write};
use std::io::{Cursor, Seek, Write};
use std::ops::Deref;

pub enum JsonFormat {
Json,
JsonLines,
Expand Down Expand Up @@ -130,7 +133,7 @@ where
#[must_use]
pub struct JsonReader<R>
where
R: BufRead + Seek,
R: MmapBytesReader,
{
reader: R,
rechunk: bool,
Expand All @@ -143,7 +146,7 @@ where

impl<R> SerReader<R> for JsonReader<R>
where
R: BufRead + Seek,
R: MmapBytesReader,
{
fn new(reader: R) -> Self {
JsonReader {
Expand All @@ -162,25 +165,30 @@ where
self
}

fn finish(mut self) -> Result<DataFrame> {
fn finish(self) -> Result<DataFrame> {
let mmap_read: ReaderBytes = (&self.reader).into();
let bytes = mmap_read.deref();

let out = match self.json_format {
JsonFormat::Json => {
let v = serde_json::from_reader(&mut self.reader)
.map_err(|e| PolarsError::ComputeError(format!("{:?}", e).into()))?;
let json_value = arrow::io::json::read::json_deserializer::parse(bytes)
.map_err(|err| PolarsError::ComputeError(format!("{:?}", err).into()))?;
// likely struct type
let dtype = json::read::infer(&v)?;
let arr = json::read::deserialize(&v, dtype)?;
let dtype = json::read::infer(&json_value)?;
let arr = json::read::deserialize(&json_value, dtype)?;
let arr = arr.as_any().downcast_ref::<StructArray>().ok_or_else(|| {
PolarsError::ComputeError("only can deserialize json objects".into())
})?;
DataFrame::try_from(arr.clone())
}
JsonFormat::JsonLines => {
let dtype = ndjson::read::infer(&mut self.reader, self.infer_schema_len)?;
self.reader.rewind()?;
let mut file = Cursor::new(bytes);

let dtype = ndjson::read::infer(&mut file, self.infer_schema_len)?;
file.rewind()?;

let mut reader = ndjson::read::FileReader::new(
&mut self.reader,
&mut file,
vec!["".to_string(); self.batch_size],
None,
);
Expand Down Expand Up @@ -209,7 +217,7 @@ where

impl<R> JsonReader<R>
where
R: BufRead + Seek,
R: MmapBytesReader,
{
/// Set the JSON file's schema
pub fn with_schema(mut self, schema: &Schema) -> Self {
Expand Down
16 changes: 8 additions & 8 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-lazy"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand Down Expand Up @@ -120,14 +120,14 @@ parking_lot = "0.12"
pyo3 = { version = "0.16", optional = true }
rayon = "1.5"
regex = { version = "1.5", optional = true }
serde = { version = "1", features = ["derive"], optional = true }
serde = { version = "1", features = ["derive", "rc"], optional = true }

polars-arrow = { version = "0.21.1", path = "../polars-arrow" }
polars-core = { version = "0.21.1", path = "../polars-core", features = ["lazy", "private", "zip_with", "random"], default-features = false }
polars-io = { version = "0.21.1", path = "../polars-io", features = ["lazy", "csv-file", "private"], default-features = false }
polars-ops = { version = "0.21.1", path = "../polars-ops", default-features = false }
polars-time = { version = "0.21.1", path = "../polars-time", optional = true }
polars-utils = { version = "0.21.1", path = "../polars-utils" }
polars-arrow = { version = "0.22.0", path = "../polars-arrow" }
polars-core = { version = "0.22.1", path = "../polars-core", features = ["lazy", "private", "zip_with", "random"], default-features = false }
polars-io = { version = "0.22.0", path = "../polars-io", features = ["lazy", "csv-file", "private"], default-features = false }
polars-ops = { version = "0.22.1", path = "../polars-ops", default-features = false }
polars-time = { version = "0.22.0", path = "../polars-time", optional = true }
polars-utils = { version = "0.22.0", path = "../polars-utils" }

[package.metadata.docs.rs]
all-features = true
Expand Down
18 changes: 9 additions & 9 deletions polars/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-ops"
version = "0.21.1"
version = "0.22.1"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand All @@ -10,20 +10,20 @@ description = "More operations on polars data structures"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
polars-arrow = { version = "0.21.1", path = "../polars-arrow", default-features = false }
polars-core = { version = "0.21.1", path = "../polars-core", features = ["private"], default-features = false }
polars-arrow = { version = "0.22.0", path = "../polars-arrow", default-features = false }
polars-core = { version = "0.22.1", path = "../polars-core", features = ["private"], default-features = false }

[features]
dtype-categorical = ["polars-core/dtype-categorical"]
dtype-date = ["polars-core/dtype-date"]
dtype-datetime = ["polars-core/dtype-datetime"]
dtype-time = ["polars-core/dtype-time"]
dtype-duration = ["polars-core/dtype-duration"]
dtype-struct = ["polars-core/dtype-struct"]
dtype-date = ["polars-core/dtype-date", "polars-core/temporal"]
dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal"]
dtype-time = ["polars-core/dtype-time", "polars-core/temporal"]
dtype-duration = ["polars-core/dtype-duration", "polars-core/temporal"]
dtype-struct = ["polars-core/dtype-struct", "polars-core/temporal"]
dtype-u8 = ["polars-core/dtype-u8"]
object = ["polars-core/object"]
to_dummies = []
list_to_struct = ["polars-core/dtype-struct", "list"]
list = []
diff = []
strings = []
strings = ["polars-core/strings"]
8 changes: 4 additions & 4 deletions polars/polars-time/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-time"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand All @@ -11,9 +11,9 @@ description = "Time related code for the polars dataframe library"
[dependencies]
chrono = "0.4"
lexical = { version = "6", default-features = false, features = ["std", "parse-floats", "parse-integers"] }
polars-arrow = { version = "0.21.1", path = "../polars-arrow", features = ["compute", "temporal"] }
polars-core = { version = "0.21.1", path = "../polars-core", default-features = false, features = ["private", "dtype-datetime", "dtype-duration", "dtype-time", "dtype-date"] }
polars-utils = { version = "0.21.1", path = "../polars-utils" }
polars-arrow = { version = "0.22.0", path = "../polars-arrow", features = ["compute", "temporal"] }
polars-core = { version = "0.22.0", path = "../polars-core", default-features = false, features = ["private", "dtype-datetime", "dtype-duration", "dtype-time", "dtype-date"] }
polars-utils = { version = "0.22.0", path = "../polars-utils" }
serde = { version = "1", features = ["derive"], optional = true }

[features]
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars-utils"
version = "0.21.1"
version = "0.22.0"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2021"
license = "MIT"
Expand Down

0 comments on commit 61e7627

Please sign in to comment.