Skip to content

Commit

Permalink
python read_json accept files (#2761)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 24, 2022
1 parent f2ad684 commit 43fd1aa
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 28 deletions.
8 changes: 7 additions & 1 deletion polars/polars-io/src/mmap.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::fs::File;
use std::io::{Cursor, Read, Seek};
use std::io::{BufReader, Cursor, Read, Seek};

/// Trait used to get hold of the file handle or the underlying bytes
/// without performing a Read.
Expand All @@ -19,6 +19,12 @@ impl MmapBytesReader for File {
}
}

/// A buffered reader over a `File` hands out a reference to the `File` it wraps.
impl MmapBytesReader for BufReader<File> {
    /// Returns the underlying `File` of this `BufReader`; always `Some`.
    fn to_file(&self) -> Option<&File> {
        let inner: &File = self.get_ref();
        Some(inner)
    }
}

impl<T> MmapBytesReader for Cursor<T>
where
T: AsRef<[u8]> + Send + Sync,
Expand Down
2 changes: 0 additions & 2 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,8 +640,6 @@ def _read_json(file: Union[str, BytesIO]) -> "DataFrame":
file
Path to a file or a file like object.
"""
if not isinstance(file, str):
file = file.read().decode("utf8")
self = DataFrame.__new__(DataFrame)
self._df = PyDataFrame.read_json(file)
return self
Expand Down
9 changes: 7 additions & 2 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use numpy::IntoPyArray;
use pyo3::types::{PyList, PyTuple};
use pyo3::{exceptions::PyRuntimeError, prelude::*};
use std::io::Read;

use polars::frame::groupby::GroupBy;
use polars::prelude::*;
Expand Down Expand Up @@ -272,11 +273,15 @@ impl PyDataFrame {

/// Deserialize a `DataFrame` from JSON text read out of `py_f`.
///
/// `py_f` is resolved through `get_file_like` (opened for reading, not
/// truncated). Its entire content is read into memory and then parsed
/// with `serde_json`.
///
/// # Errors
///
/// Returns a Python exception when `get_file_like` fails, when reading the
/// content fails (I/O error or content that is not valid UTF-8), or when
/// the text cannot be deserialized into a `DataFrame`.
#[staticmethod]
#[cfg(feature = "json")]
pub fn read_json(py_f: PyObject) -> PyResult<Self> {
    // it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
    // so don't bother with files.
    let mut json = String::new();
    // Propagate read failures to the caller as a Python exception instead of
    // panicking inside the extension module.
    get_file_like(py_f, false)?.read_to_string(&mut json)?;
    let df: DataFrame =
        serde_json::from_str(&json).map_err(|e| PyPolarsEr::Other(format!("{:?}", e)))?;
    Ok(df.into())
}

Expand Down
10 changes: 5 additions & 5 deletions py-polars/src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use pyo3::types::{PyBytes, PyString};
use std::borrow::Borrow;
use std::fs::File;
use std::io;
use std::io::{Cursor, Read, Seek, SeekFrom, Write};
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};

#[derive(Clone)]
pub struct PyFileLikeObject {
Expand Down Expand Up @@ -193,7 +193,7 @@ impl MmapBytesReader for PyFileLikeObject {}

/// A file source that is either a `PyFileLikeObject` or a file opened from Rust.
pub enum EitherRustPythonFile {
// Carries a `PyFileLikeObject`.
Py(PyFileLikeObject),
Rust(File),
Rust(BufReader<File>),
}

///
Expand All @@ -207,9 +207,9 @@ pub fn get_either_file(py_f: PyObject, truncate: bool) -> PyResult<EitherRustPyt
let rstring = pstring.to_string();
let str_slice: &str = rstring.borrow();
let f = if truncate {
File::create(str_slice)?
BufReader::new(File::create(str_slice)?)
} else {
File::open(str_slice)?
BufReader::new(File::open(str_slice)?)
};
Ok(EitherRustPythonFile::Rust(f))
} else {
Expand All @@ -222,7 +222,7 @@ pub fn get_file_like(f: PyObject, truncate: bool) -> PyResult<Box<dyn FileLike>>
use EitherRustPythonFile::*;
match get_either_file(f, truncate)? {
Py(f) => Ok(Box::new(f)),
Rust(f) => Ok(Box::new(f)),
Rust(f) => Ok(Box::new(f.into_inner())),
}
}

Expand Down
22 changes: 19 additions & 3 deletions py-polars/tests/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ def test_to_from_file(io_test_dir: str, df: pl.DataFrame) -> None:
f = os.path.join(io_test_dir, "small.json")
df.to_json(f)

# Not sure why error occur
with pytest.raises(RuntimeError):
_ = pl.read_json(f)
out = pl.read_json(f)
assert out.frame_equal(df)

# read_df = read_df.with_columns(
# [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
Expand All @@ -50,3 +49,20 @@ def test_to_json() -> None:
{"a":3,"b":null}
"""
)


def test_to_json2(df: pl.DataFrame) -> None:
    """Round-trip a frame through JSON held in in-memory binary buffers."""
    # text based conversion loses time info
    df = df.select(pl.all().exclude(["cat", "time"]))

    # Buffer filled by hand from the JSON string.
    json_text = df.to_json(to_string=True)
    text_buffer = io.BytesIO()
    text_buffer.write(json_text.encode())
    text_buffer.seek(0)
    from_text = pl.read_json(text_buffer)
    assert df.frame_equal(from_text, null_equal=True)

    # Buffer filled directly by `to_json`.
    direct_buffer = io.BytesIO()
    df.to_json(direct_buffer)
    direct_buffer.seek(0)
    from_direct = pl.read_json(direct_buffer)
    assert df.frame_equal(from_direct, null_equal=True)
16 changes: 1 addition & 15 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# flake8: noqa: W191,E101
import io
import sys
from builtins import range
from datetime import datetime
Expand Down Expand Up @@ -1075,21 +1076,6 @@ def test_rename(df: pl.DataFrame) -> None:
_ = out[["foos", "bars"]]


def test_to_json(df: pl.DataFrame) -> None:
    """Round-trip a frame through JSON text, produced directly and via a buffer."""
    # text based conversion loses time info
    df = df.select(pl.all().exclude(["cat", "time"]))

    # JSON string produced directly.
    json_text = df.to_json(to_string=True)
    from_text = pl.read_json(json_text)
    assert df.frame_equal(from_text, null_equal=True)

    # JSON written to a binary buffer, then decoded back to a string.
    buffer = BytesIO()
    df.to_json(buffer)
    buffer.seek(0)
    decoded = buffer.read().decode("utf8")
    from_buffer = pl.read_json(decoded)
    assert df.frame_equal(from_buffer, null_equal=True)


def test_to_csv() -> None:
df = pl.DataFrame(
{
Expand Down

0 comments on commit 43fd1aa

Please sign in to comment.