python [breaking]: rename some kwargs/names
ritchie46 committed Dec 9, 2021
1 parent a074b73 commit 222aa3b
Showing 4 changed files with 34 additions and 30 deletions.
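The user-facing effect of the rename, as a minimal sketch (the in-memory CSV, column names, and dtypes here are illustrative, not taken from the commit):

import io

import polars as pl

csv_data = io.StringIO("col1,col2\nfoo,1\nbar,2")

# Before this commit the keyword was `dtype`; after it, the same mapping is
# passed as `dtypes` (a dict of column name -> dtype, or a list of dtypes).
df = pl.read_csv(csv_data, dtypes={"col1": pl.Utf8, "col2": pl.Int32})
print(df.dtypes)  # the overrides are applied: Utf8, Int32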
14 changes: 7 additions & 7 deletions py-polars/polars/internals/frame.py
@@ -377,7 +377,7 @@ def read_csv(
rechunk: bool = True,
encoding: str = "utf8",
n_threads: Optional[int] = None,
- dtype: Union[Dict[str, Type[DataType]], tp.List[Type[DataType]], None] = None,
+ dtypes: Union[Dict[str, Type[DataType]], tp.List[Type[DataType]], None] = None,
low_memory: bool = False,
comment_char: Optional[str] = None,
quote_char: Optional[str] = r'"',
@@ -419,7 +419,7 @@ def read_csv(
Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with `�` character.
n_threads
Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
- dtype
+ dtypes
Overwrite the dtypes during inference.
low_memory
Reduce memory usage in expense of performance.
@@ -463,13 +463,13 @@ def read_csv(

dtype_list: Optional[tp.List[Tuple[str, Type[DataType]]]] = None
dtype_slice: Optional[tp.List[Type[DataType]]] = None
- if dtype is not None:
- if isinstance(dtype, dict):
+ if dtypes is not None:
+ if isinstance(dtypes, dict):
dtype_list = []
- for k, v in dtype.items():
+ for k, v in dtypes.items():
dtype_list.append((k, py_type_to_dtype(v)))
- elif isinstance(dtype, list):
- dtype_slice = dtype
+ elif isinstance(dtypes, list):
+ dtype_slice = dtypes
else:
raise ValueError("dtype arg should be list or dict")

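The dict-versus-list handling that the hunk above renames, pulled out as a standalone sketch; `py_type_to_dtype` is a hypothetical stand-in for the internal helper of the same name:

from typing import Dict, List, Optional, Tuple, Type, Union


def py_type_to_dtype(v: Type) -> Type:
    # Stand-in: the real helper converts Python types to polars DataType classes.
    return v


def normalize_dtypes(
    dtypes: Union[Dict[str, Type], List[Type], None]
) -> Tuple[Optional[List[Tuple[str, Type]]], Optional[List[Type]]]:
    # Split `dtypes` into the two shapes the CSV reader accepts: named
    # (column, dtype) pairs for a dict, or a positional dtype slice for a list.
    dtype_list: Optional[List[Tuple[str, Type]]] = None
    dtype_slice: Optional[List[Type]] = None
    if dtypes is not None:
        if isinstance(dtypes, dict):
            dtype_list = [(k, py_type_to_dtype(v)) for k, v in dtypes.items()]
        elif isinstance(dtypes, list):
            dtype_slice = dtypes
        else:
            raise ValueError("dtype arg should be list or dict")
    return dtype_list, dtype_slice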
38 changes: 21 additions & 17 deletions py-polars/polars/io.py
@@ -147,7 +147,7 @@ def read_csv(
rechunk: bool = True,
encoding: str = "utf8",
n_threads: Optional[int] = None,
- dtype: Optional[Union[Dict[str, Type[DataType]], List[Type[DataType]]]] = None,
+ dtypes: Optional[Union[Dict[str, Type[DataType]], List[Type[DataType]]]] = None,
new_columns: Optional[List[str]] = None,
use_pyarrow: bool = False,
low_memory: bool = False,
@@ -156,6 +156,7 @@ def read_csv(
storage_options: Optional[Dict] = None,
null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
parse_dates: bool = False,
+ **kwargs: Any,
) -> DataFrame:
"""
Read into a DataFrame from a csv file.
@@ -197,7 +198,7 @@ def read_csv(
- "utf8-lossy"
n_threads
Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
- dtype
+ dtypes
Overwrite the dtypes during inference.
new_columns
Rename columns to these right after parsing. If the given list is shorted than the width of the DataFrame the
@@ -229,6 +230,9 @@ def read_csv(
-------
DataFrame
"""
+ # for backward compatibility
+ dtypes = kwargs.get("dtypes", dtypes)

if isinstance(file, bytes) and len(file) == 0:
raise ValueError("no date in bytes")

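The new `**kwargs` plus the fallback above is the commit's backward-compatibility hook for the rename. A generic sketch of that kind of shim, illustrative only and not the commit's exact logic:

from typing import Any, Optional


def read_csv_sketch(source: object, dtypes: Optional[dict] = None, **kwargs: Any) -> Optional[dict]:
    # Accept a legacy keyword through **kwargs and map it onto the new
    # parameter name, preferring the new spelling when both are given.
    if dtypes is None and "dtype" in kwargs:
        dtypes = kwargs.pop("dtype")
    return dtypes


# Both spellings resolve to the same value under this sketch:
assert read_csv_sketch("data.csv", dtypes={"a": "Int64"}) == {"a": "Int64"}
assert read_csv_sketch("data.csv", dtype={"a": "Int64"}) == {"a": "Int64"}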
@@ -249,7 +253,7 @@

if (
use_pyarrow
- and dtype is None
+ and dtypes is None
and stop_after_n_rows is None
and n_threads is None
and encoding == "utf8"
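Only the parameter name changes in this condition; the pyarrow fast path is still taken only when none of the listed overrides are set (the hunk is truncated, so the real condition may contain further clauses). As a standalone predicate, roughly:

def can_use_pyarrow_path(
    use_pyarrow: bool,
    dtypes: object,
    stop_after_n_rows: object,
    n_threads: object,
    encoding: str,
) -> bool:
    # Mirrors the clauses visible in the hunk above.
    return (
        use_pyarrow
        and dtypes is None
        and stop_after_n_rows is None
        and n_threads is None
        and encoding == "utf8"
    )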
@@ -296,11 +300,11 @@ def read_csv(
return update_columns(df, new_columns) # type: ignore
return df # type: ignore

- if new_columns and dtype and isinstance(dtype, dict):
+ if new_columns and dtypes and isinstance(dtypes, dict):
current_columns = None

# As new column names are not available yet while parsing the CSV file, rename column names in
- # dtype to old names (if possible) so they can be used during CSV parsing.
+ # dtypes to old names (if possible) so they can be used during CSV parsing.
if columns:
if len(columns) < len(new_columns):
raise ValueError(
@@ -330,28 +334,28 @@
else:
# When a header is present, column names are not known yet.

- if len(dtype) <= len(new_columns):
- # If dtype dictionary contains less or same amount of values than new column names
- # a list of dtypes can be created if all listed column names in dtype dictionary
+ if len(dtypes) <= len(new_columns):
+ # If dtypes dictionary contains less or same amount of values than new column names
+ # a list of dtypes can be created if all listed column names in dtypes dictionary
# appear in the first consecutive new column names.
dtype_list = [
- dtype[new_column_name]
- for new_column_name in new_columns[0 : len(dtype)]
- if new_column_name in dtype
+ dtypes[new_column_name]
+ for new_column_name in new_columns[0 : len(dtypes)]
+ if new_column_name in dtypes
]

- if len(dtype_list) == len(dtype):
- dtype = dtype_list
+ if len(dtype_list) == len(dtypes):
+ dtypes = dtype_list

- if current_columns and isinstance(dtype, dict):
+ if current_columns and isinstance(dtypes, dict):
new_to_current = {
new_column: current_column
for new_column, current_column in zip(new_columns, current_columns)
}
# Change new column names to current column names in dtype.
- dtype = {
+ dtypes = {
new_to_current.get(column_name, column_name): column_dtype
- for column_name, column_dtype in dtype.items()
+ for column_name, column_dtype in dtypes.items()
}

with _prepare_file_arg(file, **storage_options) as data:
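The bookkeeping in the hunk above boils down to re-keying the dtype overrides onto the column names the parser will actually see. A minimal worked example with hypothetical names:

new_columns = ["A", "B", "C"]
current_columns = ["a", "b", "c"]
dtypes = {"A": "Utf8", "C": "Float32"}  # keyed on the *new* (renamed) columns

new_to_current = {
    new_column: current_column
    for new_column, current_column in zip(new_columns, current_columns)
}
# Translate the override keys back to the current (pre-rename) column names.
dtypes = {
    new_to_current.get(column_name, column_name): column_dtype
    for column_name, column_dtype in dtypes.items()
}
assert dtypes == {"a": "Utf8", "c": "Float32"}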
@@ -369,7 +373,7 @@ def read_csv(
rechunk=rechunk,
encoding=encoding,
n_threads=n_threads,
- dtype=dtype,
+ dtypes=dtypes,
low_memory=low_memory,
comment_char=comment_char,
quote_char=quote_char,
2 changes: 1 addition & 1 deletion py-polars/tests/test_df.py
@@ -700,7 +700,7 @@ def test_read_csv_categorical() -> None:
f = BytesIO()
f.write(b"col1,col2,col3,col4,col5,col6\n'foo',2,3,4,5,6\n'bar',8,9,10,11,12")
f.seek(0)
- df = pl.DataFrame.read_csv(f, has_headers=True, dtype={"col1": pl.Categorical})
+ df = pl.DataFrame.read_csv(f, has_headers=True, dtypes={"col1": pl.Categorical})
assert df["col1"].dtype == pl.Categorical


10 changes: 5 additions & 5 deletions py-polars/tests/test_io.py
@@ -189,7 +189,7 @@ def test_partial_dtype_overwrite() -> None:
1,2,3
"""
f = io.StringIO(csv)
- df = pl.read_csv(f, dtype=[pl.Utf8])
+ df = pl.read_csv(f, dtypes=[pl.Utf8])
assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]


@@ -216,7 +216,7 @@ def test_column_rename_and_dtype_overwrite() -> None:
df = pl.read_csv(
f,
new_columns=["A", "B", "C"],
dtype={"A": pl.Utf8, "B": pl.Int64, "C": pl.Float32},
dtypes={"A": pl.Utf8, "B": pl.Int64, "C": pl.Float32},
)
assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]

@@ -225,7 +225,7 @@ def test_column_rename_and_dtype_overwrite() -> None:
f,
columns=["a", "c"],
new_columns=["A", "C"],
dtype={"A": pl.Utf8, "C": pl.Float32},
dtypes={"A": pl.Utf8, "C": pl.Float32},
)
assert df.dtypes == [pl.Utf8, pl.Float32]

@@ -237,7 +237,7 @@ def test_column_rename_and_dtype_overwrite() -> None:
df = pl.read_csv(
f,
new_columns=["A", "B", "C"],
dtype={"A": pl.Utf8, "C": pl.Float32},
dtypes={"A": pl.Utf8, "C": pl.Float32},
has_headers=False,
)
assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]
@@ -396,7 +396,7 @@ def test_ignore_parse_dates() -> None:
dtypes: Dict[str, Type[DataType]] = {
k: pl.Utf8 for k in headers
} # Forces Utf8 type for every column
- df = pl.read_csv(csv, columns=headers, dtype=dtypes)
+ df = pl.read_csv(csv, columns=headers, dtypes=dtypes)
assert df.dtypes == [pl.Utf8, pl.Utf8, pl.Utf8]

