Allow infer_schema_length=None in pl.read_csv().

pola-rs · Nov 5, 2021 · ee26601
1 parent d0ed0e9
commit ee26601
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 6 deletions.
diff --git a/py-polars/polars/eager/frame.py b/py-polars/polars/eager/frame.py
@@ -352,7 +352,7 @@ def _from_pandas(
     @staticmethod
     def read_csv(
         file: Union[str, BinaryIO, bytes],
-        infer_schema_length: int = 100,
+        infer_schema_length: Optional[int] = 100,
         batch_size: int = 64,
         has_headers: bool = True,
         ignore_errors: bool = False,
@@ -379,7 +379,8 @@ def read_csv(
         file
             Path to a file or a file like object. Any valid filepath can be used. Example: `file.csv`.
         infer_schema_length
-            Maximum number of lines to read to infer schema.
+            Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
+            If set to `None`, a full table scan will be done (slow).
         batch_size
             Number of lines to read into the buffer at once. Modify this to change performance.
         has_headers

diff --git a/py-polars/polars/io.py b/py-polars/polars/io.py
@@ -145,7 +145,7 @@ def update_columns(df: "pl.DataFrame", new_columns: List[str]) -> "pl.DataFrame"
 
 def read_csv(
     file: Union[str, TextIO, Path, BinaryIO, bytes],
-    infer_schema_length: int = 100,
+    infer_schema_length: Optional[int] = 100,
     batch_size: int = 8192,
     has_headers: bool = True,
     ignore_errors: bool = False,
@@ -181,7 +181,8 @@ def read_csv(
         or ``StringIO`` or ``BytesIO``.
         If ``fsspec`` is installed, it will be used to open remote files
     infer_schema_length
-        Maximum number of lines to read to infer schema.
+        Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
+        If set to `None`, a full table scan will be done (slow).
     batch_size
         Number of lines to read into the buffer at once. Modify this to change performance.
     has_headers

diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs
@@ -80,7 +80,7 @@ impl PyDataFrame {
     #[allow(clippy::too_many_arguments)]
     pub fn read_csv(
         py_f: &PyAny,
-        infer_schema_length: usize,
+        infer_schema_length: Option<usize>,
         chunk_size: usize,
         has_header: bool,
         ignore_errors: bool,
@@ -147,7 +147,7 @@ impl PyDataFrame {
 
         let mmap_bytes_r = get_mmap_bytes_reader(py_f)?;
         let df = CsvReader::new(mmap_bytes_r)
-            .infer_schema(Some(infer_schema_length))
+            .infer_schema(infer_schema_length)
             .has_header(has_header)
             .with_stop_after_n_rows(stop_after_n_rows)
             .with_delimiter(sep.as_bytes()[0])