Assign dtypes to expected columns when dtypes is a list and column se…

…lection is used when reading a CSV file. (#3901) Assign dtypes to the expected columns when dtypes is a list and column selection is used when reading a CSV file. As dtypes are passed to the polars CSV reader, projection and column selection were not applied to the dtypes when dtypes is a list, so the dtypes were applied to the first x columns instead of to the corresponding columns selected by the projection/column selection. A remaining issue that is not fixed: when a projection is combined with a list of dtypes that contains a pl.Date type, the values of that column are converted to null dates. When column names are used for the selection instead, the pl.Date type correctly converts that column to dates. Fixes: #3891
pola-rs · Jul 5, 2022 · d9219fe · d9219fe
1 parent 6adbff3
commit d9219fe
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 1 deletion.
diff --git a/py-polars/polars/io.py b/py-polars/polars/io.py
@@ -17,7 +17,7 @@
     _PYARROW_AVAILABLE = False
 
 from polars.convert import from_arrow
-from polars.datatypes import DataType
+from polars.datatypes import DataType, Utf8
 from polars.internals import DataFrame, LazyFrame, _scan_ds
 from polars.internals.io import _prepare_file_arg
 
@@ -260,6 +260,32 @@ def read_csv(
             return update_columns(df, new_columns)
         return df
 
+    if projection and dtypes and isinstance(dtypes, list):
+        if len(projection) < len(dtypes):
+            raise ValueError(
+                "More dtypes overrides are specified than there are selected columns."
+            )
+
+        # Fix list of dtypes when used together with projection as polars CSV reader
+        # wants a list of dtypes for the x first columns before it does the projection.
+        dtypes_list: list[type[DataType]] = [Utf8] * (max(projection) + 1)
+
+        for idx, column_idx in enumerate(projection):
+            if idx < len(dtypes):
+                dtypes_list[column_idx] = dtypes[idx]
+
+        dtypes = dtypes_list
+
+    if columns and dtypes and isinstance(dtypes, list):
+        if len(columns) < len(dtypes):
+            raise ValueError(
+                "More dtypes overrides are specified than there are selected columns."
+            )
+
+        # Map list of dtypes when used together with selected columns as a dtypes dict
+        # so the dtypes are applied to the correct column instead of the first x columns.
+        dtypes = {column: dtype for column, dtype in zip(columns, dtypes)}
+
     if new_columns and dtypes and isinstance(dtypes, dict):
         current_columns = None
 

diff --git a/py-polars/tests/io/test_csv.py b/py-polars/tests/io/test_csv.py
@@ -103,6 +103,32 @@ def test_partial_dtype_overwrite() -> None:
     assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]
 
 
+def test_dtype_overwrite_with_column_name_selection() -> None:
+    csv = """
+a,b,c,d
+1,2,3,4
+1,2,3,4
+"""
+    f = io.StringIO(csv)
+    df = pl.read_csv(f, columns=["c", "b", "d"], dtypes=[pl.Int32, pl.Utf8])
+    assert df.dtypes == [pl.Utf8, pl.Int32, pl.Int64]
+
+
+def test_dtype_overwrite_with_column_idx_selection() -> None:
+    csv = """
+a,b,c,d
+1,2,3,4
+1,2,3,4
+"""
+    f = io.StringIO(csv)
+    df = pl.read_csv(f, columns=[2, 1, 3], dtypes=[pl.Int32, pl.Utf8])
+    # Columns without an explicit dtype set will get pl.Utf8 if dtypes is a list
+    # if the column selection is done with column indices instead of column names.
+    assert df.dtypes == [pl.Utf8, pl.Int32, pl.Utf8]
+    # Projections are sorted.
+    assert df.columns == ["b", "c", "d"]
+
+
 def test_partial_column_rename() -> None:
     csv = """
 a,b,c