Skip to content

Commit

Permalink
[python] add more serde options
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 16, 2020
1 parent df6268d commit a3bd270
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 12 deletions.
6 changes: 3 additions & 3 deletions polars/src/frame/ser/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ where
self
}

/// Set the size of the write buffers. Buffer size is the amount of rows written at once.
pub fn with_buffer_size(mut self, buffer_size: usize) -> Self {
self.buffer_size = buffer_size;
/// Set the batch size: the number of rows written at once.
///
/// NOTE(review): the value is stored in the writer's `buffer_size` field —
/// the struct field kept its old name when this setter was renamed.
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
    self.buffer_size = batch_size;
    self
}
}
Expand Down
2 changes: 1 addition & 1 deletion py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
polars = {path = "../polars"}
polars = {path = "../polars", features = ["parquet"]}
pyo3 = {version = "0.11", features = ["extension-module"] }
thiserror = "1.0.20"
numpy = "0.11"
Expand Down
35 changes: 31 additions & 4 deletions py-polars/polars/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,41 @@ def from_pydf(df: PyDataFrame) -> DataFrame:

@staticmethod
def from_csv(
    path: str,
    infer_schema_length: int = 100,
    batch_size: int = 100000,
    has_headers: bool = True,
    ignore_errors: bool = False,
) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    path
        Location of the CSV file.
    infer_schema_length
        Number of rows used to infer the column dtypes.
    batch_size
        Number of rows read per batch.
    has_headers
        Whether the first row holds the column names.
    ignore_errors
        Do not raise on CSV parser errors (forwarded to the Rust reader's
        `with_ignore_parser_error`).
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_csv(
        path, infer_schema_length, batch_size, has_headers, ignore_errors
    )
    return df

def to_csv(self, path: str, has_headers: bool = True, delimiter: str = ","):
self._df.to_csv(path, has_headers, ord(delimiter))
@staticmethod
def from_parquet(path: str, batch_size: int = 250000) -> DataFrame:
    """
    Read a Parquet file into a DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file.
    batch_size
        Number of rows handled per batch by the underlying reader.
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_parquet(path, batch_size)
    return df

@staticmethod
def from_ipc(path: str) -> DataFrame:
    """
    Read an Arrow IPC file into a DataFrame.

    Parameters
    ----------
    path
        Location of the IPC file.
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_ipc(path)
    return df

def to_csv(
    self,
    path: str,
    batch_size: int = 100000,
    has_headers: bool = True,
    delimiter: str = ",",
):
    """
    Write the DataFrame to a CSV file.

    Parameters
    ----------
    path
        Destination file.
    batch_size
        Number of rows written at once.
    has_headers
        Write the column names as the first row.
    delimiter
        Single character used as the field separator.
    """
    # NOTE(review): `batch_size` was inserted before `has_headers`; callers
    # that passed `has_headers` positionally must be updated.
    # The Rust binding expects the delimiter as a single byte, hence ord().
    self._df.to_csv(path, batch_size, has_headers, ord(delimiter))

def to_ipc(self, path: str, batch_size: int = 100000):
    """
    Write the DataFrame to an Arrow IPC file.

    Parameters
    ----------
    path
        Destination file.
    batch_size
        Number of rows written at once. Defaults to 100000 for consistency
        with `from_csv` / `to_csv`; previously this argument was required
        and unannotated.
    """
    self._df.to_ipc(path, batch_size)

def __str__(self) -> str:
return self._df.as_str()
Expand Down
52 changes: 48 additions & 4 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,70 @@ impl PyDataFrame {
}

#[staticmethod]
pub fn from_csv(path: &str, infer_schema_length: usize, batch_size: usize) -> PyResult<Self> {
pub fn from_csv(
path: &str,
infer_schema_length: usize,
batch_size: usize,
has_header: bool,
ignore_errors: bool,
) -> PyResult<Self> {
// TODO: use python file objects:
// https://github.com/mre/hyperjson/blob/e1a0515f8d033f24b9fba64a0a4c77df841bbd1b/src/lib.rs#L20
let file = std::fs::File::open(path)?;

let df = CsvReader::new(file)
let reader = CsvReader::new(file)
.infer_schema(Some(infer_schema_length))
.has_header(true)
.has_header(has_header)
.with_batch_size(batch_size);

let reader = if ignore_errors {
reader.with_ignore_parser_error()
} else {
reader
};
let df = reader.finish().map_err(PyPolarsEr::from)?;
Ok(PyDataFrame::new(df))
}

/// Read a Parquet file from `path` into a new `PyDataFrame`.
#[staticmethod]
pub fn from_parquet(path: &str, batch_size: usize) -> PyResult<Self> {
    let file = std::fs::File::open(path)?;
    let reader = ParquetReader::new(file).with_batch_size(batch_size);
    let df = reader.finish().map_err(PyPolarsEr::from)?;
    Ok(PyDataFrame::new(df))
}

pub fn to_csv(&mut self, path: &str, has_headers: bool, delimiter: u8) -> PyResult<()> {
/// Read an Arrow IPC file from `path` into a new `PyDataFrame`.
#[staticmethod]
pub fn from_ipc(path: &str) -> PyResult<Self> {
    let file = std::fs::File::open(path)?;
    let reader = IPCReader::new(file);
    let df = reader.finish().map_err(PyPolarsEr::from)?;
    Ok(PyDataFrame::new(df))
}

/// Write the DataFrame as CSV to `path`.
///
/// `delimiter` is a single byte; `batch_size` is the number of rows
/// written at once.
pub fn to_csv(
    &mut self,
    path: &str,
    batch_size: usize,
    has_headers: bool,
    delimiter: u8,
) -> PyResult<()> {
    // TODO: use python file objects:
    let mut file = std::fs::File::create(path)?;
    let writer = CsvWriter::new(&mut file)
        .has_headers(has_headers)
        .with_delimiter(delimiter)
        .with_batch_size(batch_size);
    writer.finish(&mut self.df).map_err(PyPolarsEr::from)?;
    Ok(())
}

pub fn to_ipc(&mut self, path: &str, batch_size: usize) -> PyResult<()> {
let mut buf = std::fs::File::create(path)?;
IPCWriter::new(&mut buf)
.with_batch_size(batch_size)
.finish(&mut self.df)
.map_err(PyPolarsEr::from)?;
Ok(())
Expand Down

0 comments on commit a3bd270

Please sign in to comment.