Skip to content

Commit

Permalink
[python] add more serde options
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 16, 2020
1 parent df6268d commit a3bd270
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 12 deletions.
6 changes: 3 additions & 3 deletions polars/src/frame/ser/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ where
self
}

/// Set the size of the write buffers. Buffer size is the amount of rows written at once.
pub fn with_buffer_size(mut self, buffer_size: usize) -> Self {
self.buffer_size = buffer_size;
/// Set the batch size: the number of rows written at once.
///
/// NOTE(review): the value is stored in the writer's `buffer_size` field —
/// the struct field kept its old name when this setter was renamed.
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
    self.buffer_size = batch_size;
    self
}
}
Expand Down
2 changes: 1 addition & 1 deletion py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
polars = {path = "../polars"}
polars = {path = "../polars", features = ["parquet"]}
pyo3 = {version = "0.11", features = ["extension-module"] }
thiserror = "1.0.20"
numpy = "0.11"
Expand Down
35 changes: 31 additions & 4 deletions py-polars/polars/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,41 @@ def from_pydf(df: PyDataFrame) -> DataFrame:

@staticmethod
def from_csv(
    path: str,
    infer_schema_length: int = 100,
    batch_size: int = 100000,
    has_headers: bool = True,
    ignore_errors: bool = False,
) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    path
        Location of the CSV file.
    infer_schema_length
        Number of rows used to infer the column dtypes.
    batch_size
        Number of rows read per batch.
    has_headers
        Whether the first row holds the column names.
    ignore_errors
        Do not raise on CSV parser errors (forwarded to the Rust reader's
        `with_ignore_parser_error`).
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_csv(
        path, infer_schema_length, batch_size, has_headers, ignore_errors
    )
    return df

def to_csv(self, path: str, has_headers: bool = True, delimiter: str = ","):
self._df.to_csv(path, has_headers, ord(delimiter))
@staticmethod
def from_parquet(path: str, batch_size: int = 250000) -> DataFrame:
    """
    Read a Parquet file into a DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file.
    batch_size
        Number of rows handled per batch by the underlying reader.
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_parquet(path, batch_size)
    return df

@staticmethod
def from_ipc(path: str) -> DataFrame:
    """
    Read an Arrow IPC file into a DataFrame.

    Parameters
    ----------
    path
        Location of the IPC file.
    """
    df = DataFrame.__new__(DataFrame)
    df._df = PyDataFrame.from_ipc(path)
    return df

def to_csv(
    self,
    path: str,
    batch_size: int = 100000,
    has_headers: bool = True,
    delimiter: str = ",",
):
    """
    Write the DataFrame to a CSV file.

    Parameters
    ----------
    path
        Destination file.
    batch_size
        Number of rows written at once.
    has_headers
        Write the column names as the first row.
    delimiter
        Single character used as the field separator.
    """
    # NOTE(review): `batch_size` was inserted before `has_headers`; callers
    # that passed `has_headers` positionally must be updated.
    # The Rust binding expects the delimiter as a single byte, hence ord().
    self._df.to_csv(path, batch_size, has_headers, ord(delimiter))

def to_ipc(self, path: str, batch_size: int = 100000):
    """
    Write the DataFrame to an Arrow IPC file.

    Parameters
    ----------
    path
        Destination file.
    batch_size
        Number of rows written at once. Defaults to 100000 for consistency
        with `from_csv` / `to_csv`; previously this argument was required
        and unannotated.
    """
    self._df.to_ipc(path, batch_size)

def __str__(self) -> str:
return self._df.as_str()
Expand Down
52 changes: 48 additions & 4 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,70 @@ impl PyDataFrame {
}

#[staticmethod]
pub fn from_csv(path: &str, infer_schema_length: usize, batch_size: usize) -> PyResult<Self> {
pub fn from_csv(
path: &str,
infer_schema_length: usize,
batch_size: usize,
has_header: bool,
ignore_errors: bool,
) -> PyResult<Self> {
// TODO: use python file objects:
// https://github.com/mre/hyperjson/blob/e1a0515f8d033f24b9fba64a0a4c77df841bbd1b/src/lib.rs#L20
let file = std::fs::File::open(path)?;

let df = CsvReader::new(file)
let reader = CsvReader::new(file)
.infer_schema(Some(infer_schema_length))
.has_header(true)
.has_header(has_header)
.with_batch_size(batch_size);

let reader = if ignore_errors {
reader.with_ignore_parser_error()
} else {
reader
};
let df = reader.finish().map_err(PyPolarsEr::from)?;
Ok(PyDataFrame::new(df))
}

/// Read a Parquet file from `path` into a new `PyDataFrame`.
#[staticmethod]
pub fn from_parquet(path: &str, batch_size: usize) -> PyResult<Self> {
    let file = std::fs::File::open(path)?;
    let reader = ParquetReader::new(file).with_batch_size(batch_size);
    let df = reader.finish().map_err(PyPolarsEr::from)?;
    Ok(PyDataFrame::new(df))
}

pub fn to_csv(&mut self, path: &str, has_headers: bool, delimiter: u8) -> PyResult<()> {
/// Read an Arrow IPC file from `path` into a new `PyDataFrame`.
#[staticmethod]
pub fn from_ipc(path: &str) -> PyResult<Self> {
    let file = std::fs::File::open(path)?;
    let reader = IPCReader::new(file);
    let df = reader.finish().map_err(PyPolarsEr::from)?;
    Ok(PyDataFrame::new(df))
}

/// Write the DataFrame as CSV to `path`.
///
/// `delimiter` is a single byte; `batch_size` is the number of rows
/// written at once.
pub fn to_csv(
    &mut self,
    path: &str,
    batch_size: usize,
    has_headers: bool,
    delimiter: u8,
) -> PyResult<()> {
    // TODO: use python file objects:
    let mut file = std::fs::File::create(path)?;
    let writer = CsvWriter::new(&mut file)
        .has_headers(has_headers)
        .with_delimiter(delimiter)
        .with_batch_size(batch_size);
    writer.finish(&mut self.df).map_err(PyPolarsEr::from)?;
    Ok(())
}

pub fn to_ipc(&mut self, path: &str, batch_size: usize) -> PyResult<()> {
let mut buf = std::fs::File::create(path)?;
IPCWriter::new(&mut buf)
.with_batch_size(batch_size)
.finish(&mut self.df)
.map_err(PyPolarsEr::from)?;
Ok(())
Expand Down

0 comments on commit a3bd270

Please sign in to comment.