Skip to content

Commit

Permalink
Assign dtypes to expected columns when dtypes is a list and column se…
Browse files Browse the repository at this point in the history
…lection is used when reading a CSV file. (#3901)

Assign dtypes to expected columns when dtypes is a list and column
selection is used when reading a CSV file.

When dtypes were passed to the polars CSV reader as a list, the
projection/column selection was not applied to them, so the dtypes
were applied to the first x columns instead of to the corresponding
columns selected by the projection/column selection.

A remaining issue that is not fixed:
  When a projection is combined with a list of dtypes that contains a
  pl.Date type, the values of that column are converted to null dates.
  When column names are used for the selection instead, the pl.Date type
  correctly converts that column to dates.

Fixes: #3891
  • Loading branch information
ghuls committed Jul 5, 2022
1 parent 6adbff3 commit d9219fe
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 1 deletion.
28 changes: 27 additions & 1 deletion py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
_PYARROW_AVAILABLE = False

from polars.convert import from_arrow
from polars.datatypes import DataType
from polars.datatypes import DataType, Utf8
from polars.internals import DataFrame, LazyFrame, _scan_ds
from polars.internals.io import _prepare_file_arg

Expand Down Expand Up @@ -260,6 +260,32 @@ def read_csv(
return update_columns(df, new_columns)
return df

if projection and dtypes and isinstance(dtypes, list):
if len(projection) < len(dtypes):
raise ValueError(
"More dtypes overrides are specified than there are selected columns."
)

# Fix list of dtypes when used together with projection as polars CSV reader
# wants a list of dtypes for the x first columns before it does the projection.
dtypes_list: list[type[DataType]] = [Utf8] * (max(projection) + 1)

for idx, column_idx in enumerate(projection):
if idx < len(dtypes):
dtypes_list[column_idx] = dtypes[idx]

dtypes = dtypes_list

if columns and dtypes and isinstance(dtypes, list):
if len(columns) < len(dtypes):
raise ValueError(
"More dtypes overrides are specified than there are selected columns."
)

# Map list of dtypes when used together with selected columns as a dtypes dict
# so the dtypes are applied to the correct column instead of the first x columns.
dtypes = {column: dtype for column, dtype in zip(columns, dtypes)}

if new_columns and dtypes and isinstance(dtypes, dict):
current_columns = None

Expand Down
26 changes: 26 additions & 0 deletions py-polars/tests/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,32 @@ def test_partial_dtype_overwrite() -> None:
assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]


def test_dtype_overwrite_with_column_name_selection() -> None:
    """Check a dtypes list combined with column selection by name.

    The dtypes list is given in the order of the ``columns`` argument
    ("c" -> Int32, "b" -> Utf8), while the asserted dtypes are
    [Utf8, Int32, Int64] for the three selected columns.
    """
    csv_text = """
a,b,c,d
1,2,3,4
1,2,3,4
"""
    selected_columns = ["c", "b", "d"]
    dtype_overrides = [pl.Int32, pl.Utf8]

    frame = pl.read_csv(
        io.StringIO(csv_text),
        columns=selected_columns,
        dtypes=dtype_overrides,
    )

    # "d" has no override in the list, so the last expected dtype is Int64.
    expected_dtypes = [pl.Utf8, pl.Int32, pl.Int64]
    assert frame.dtypes == expected_dtypes


def test_dtype_overwrite_with_column_idx_selection() -> None:
    """Check a dtypes list combined with column selection by index.

    The dtypes list is given in the order of the ``columns`` indices
    (2 -> Int32, 1 -> Utf8); the resulting frame is asserted to have the
    columns "b", "c", "d" with dtypes [Utf8, Int32, Utf8].
    """
    csv_text = """
a,b,c,d
1,2,3,4
1,2,3,4
"""
    selected_indices = [2, 1, 3]
    dtype_overrides = [pl.Int32, pl.Utf8]

    frame = pl.read_csv(
        io.StringIO(csv_text),
        columns=selected_indices,
        dtypes=dtype_overrides,
    )

    # Columns without an explicit dtype set will get pl.Utf8 if dtypes is a list
    # if the column selection is done with column indices instead of column names.
    expected_dtypes = [pl.Utf8, pl.Int32, pl.Utf8]
    assert frame.dtypes == expected_dtypes

    # Projections are sorted.
    expected_columns = ["b", "c", "d"]
    assert frame.columns == expected_columns


def test_partial_column_rename() -> None:
csv = """
a,b,c
Expand Down

0 comments on commit d9219fe

Please sign in to comment.