Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Rename Utf8 data type to String, keep Utf8 as alias #13257

Merged
merged 7 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/src/python/user-guide/expressions/casting.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@
)

out = df.select(
pl.col("integers").cast(pl.Utf8),
pl.col("float").cast(pl.Utf8),
pl.col("integers").cast(pl.String),
pl.col("float").cast(pl.String),
pl.col("floats_as_string").cast(pl.Float64),
)
print(out)
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/datatypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,6 @@ Other
Enum
Null
Object
String
Utf8
Unknown
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
List,
Null,
Object,
String,
Struct,
Time,
UInt8,
Expand Down Expand Up @@ -250,6 +251,7 @@
"List",
"Null",
"Object",
"String",
"Struct",
"Time",
"UInt16",
Expand Down
18 changes: 9 additions & 9 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import polars._reexport as pl
from polars import functions as F
from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8
from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, String, Struct
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import NoDataError
Expand Down Expand Up @@ -152,7 +152,7 @@ def from_dicts(
>>> pl.from_dicts(
... data,
... schema=["a", "b", "c", "d"],
... schema_overrides={"c": pl.Float64, "d": pl.Utf8},
... schema_overrides={"c": pl.Float64, "d": pl.String},
... )
shape: (3, 4)
┌─────┬─────┬──────┬──────┐
Expand Down Expand Up @@ -286,15 +286,15 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
if coldata:
coldata.pop(idx)

# init cols as utf8 Series, handle "null" -> None, create schema from repr dtype
# init cols as String Series, handle "null" -> None, create schema from repr dtype
data = [
pl.Series([(None if v == "null" else v) for v in cd], dtype=Utf8)
pl.Series([(None if v == "null" else v) for v in cd], dtype=String)
for cd in coldata
]
schema = dict(zip(headers, (dtype_short_repr_to_dtype(d) for d in dtypes)))
if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0:
empty_data = [None] * len(data[0])
data.extend((pl.Series(empty_data, dtype=Utf8)) for _ in range(n_extend_cols))
data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols))
for dtype in set(schema.values()):
if dtype in (List, Struct, Object):
raise NotImplementedError(
Expand All @@ -306,10 +306,10 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
if no_dtypes:
if df.is_empty():
# if no dtypes *and* empty, default to string
return df.with_columns(F.all().cast(Utf8))
return df.with_columns(F.all().cast(String))
else:
# otherwise, take a trip through our CSV inference logic
if all(tp == Utf8 for tp in df.schema.values()):
if all(tp == String for tp in df.schema.values()):
buf = io.BytesIO()
df.write_csv(file=buf)
df = read_csv(buf, new_columns=df.columns, try_parse_dates=True)
Expand Down Expand Up @@ -347,10 +347,10 @@ def _from_series_repr(m: re.Match[str]) -> Series:
if not values:
return pl.Series(name=name, values=values, dtype=dtype)
else:
srs = pl.Series(name=name, values=values, dtype=Utf8)
srs = pl.Series(name=name, values=values, dtype=String)
if dtype is None:
return srs
elif dtype in (Categorical, Utf8):
elif dtype in (Categorical, String):
return srs.str.replace('^"(.*)"$', r"$1").cast(dtype)

return _cast_repr_strings_with_schema(
Expand Down
28 changes: 14 additions & 14 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
Float64,
Null,
Object,
String,
Unknown,
Utf8,
py_type_to_dtype,
)
from polars.dependencies import (
Expand Down Expand Up @@ -1228,7 +1228,7 @@ def dtypes(self) -> list[DataType]:
... }
... )
>>> df.dtypes
[Int64, Float64, Utf8]
[Int64, Float64, String]
>>> df
shape: (3, 3)
┌─────┬─────┬─────┐
Expand Down Expand Up @@ -1271,7 +1271,7 @@ def schema(self) -> OrderedDict[str, DataType]:
... }
... )
>>> df.schema
OrderedDict({'foo': Int64, 'bar': Float64, 'ham': Utf8})
OrderedDict({'foo': Int64, 'bar': Float64, 'ham': String})

"""
return OrderedDict(zip(self.columns, self.dtypes))
Expand Down Expand Up @@ -1719,7 +1719,7 @@ def __getitem__(

if isinstance(item, pl.Series):
dtype = item.dtype
if dtype == Utf8:
if dtype == String:
return self._from_pydf(self._df.select(item))
elif dtype.is_integer():
return self._take_with_series(item._pos_idxs(self.shape[0]))
Expand Down Expand Up @@ -2079,7 +2079,7 @@ def to_numpy(

Notes
-----
If you're attempting to convert Utf8 or Decimal to an array, you'll need to
If you're attempting to convert String or Decimal to an array, you'll need to
install `pyarrow`.

Examples
Expand Down Expand Up @@ -2123,7 +2123,7 @@ def to_numpy(
a = s.to_numpy(use_pyarrow=use_pyarrow)
arrays.append(
a.astype(str, copy=False)
if tp == Utf8 and not s.null_count()
if tp == String and not s.null_count()
else a
)

Expand Down Expand Up @@ -2309,15 +2309,15 @@ def to_init_repr(self, n: int = 1000) -> str:
... [
... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
... pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8),
... pl.Series("ham", ["a", "b", "c"], dtype=pl.String),
... ]
... )
>>> print(df.to_init_repr())
pl.DataFrame(
[
pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
pl.Series("ham", ['a', 'b', 'c'], dtype=pl.Utf8),
pl.Series("ham", ['a', 'b', 'c'], dtype=pl.String),
]
)

Expand Down Expand Up @@ -3848,7 +3848,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float:
... "y": [v / 1000 for v in range(1_000_000)],
... "z": [str(v) for v in range(1_000_000)],
... },
... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
... )
>>> df.estimated_size()
25888898
Expand Down Expand Up @@ -4267,7 +4267,7 @@ def glimpse(
schema = self.schema

def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
fn = repr if schema[col_name] == Utf8 else str
fn = repr if schema[col_name] == String else str
values = self[:max_n_values][col_name].to_list()
val_str = ", ".join(fn(v) for v in values) # type: ignore[operator]
if len(col_name) > max_colname_length:
Expand Down Expand Up @@ -6727,15 +6727,15 @@ def cast(

Cast all frame columns to the specified dtype:

>>> df.cast(pl.Utf8).to_dict(as_series=False)
>>> df.cast(pl.String).to_dict(as_series=False)
{'foo': ['1', '2', '3'],
'bar': ['6.0', '7.0', '8.0'],
'ham': ['2020-01-02', '2021-03-04', '2022-05-06']}

Use selectors to define the columns being cast:

>>> import polars.selectors as cs
>>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8})
>>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String})
shape: (3, 3)
┌─────┬─────┬────────────┐
│ foo ┆ bar ┆ ham │
Expand Down Expand Up @@ -7089,7 +7089,7 @@ def explode(
----------
columns
Column names, expressions, or a selector defining them. The underlying
columns being exploded must be of List or Utf8 datatype.
columns being exploded must be of List or String datatype.
*more_columns
Additional names of columns to explode, specified as positional arguments.

Expand Down Expand Up @@ -9248,7 +9248,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
Examples of the supercast rules applied when performing an arithmetic operation
on two DataTypes:

- Int8 + Utf8 = Utf8
- Int8 + String = String
- Float32 + Int64 = Float32
- Float32 + Float64 = Float64

Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/datatypes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
List,
Null,
Object,
String,
Struct,
TemporalType,
Time,
Expand Down Expand Up @@ -97,6 +98,7 @@
"List",
"Null",
"Object",
"String",
"Struct",
"TemporalType",
"Time",
Expand Down
18 changes: 12 additions & 6 deletions py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,10 +376,14 @@ class Boolean(DataType):
"""Boolean type."""


class Utf8(DataType):
class String(DataType):
"""UTF-8 encoded string type."""


# Allow Utf8 as an alias for String
Utf8 = String
Comment on lines -379 to +384
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the core of this PR.



class Binary(DataType):
"""Binary type."""

Expand Down Expand Up @@ -745,15 +749,17 @@ def __init__(self, fields: Sequence[Field] | SchemaDict):
--------
Initialize using a dictionary:

>>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.Utf8)})
>>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.String)})
>>> dtype
Struct({'a': Int8, 'b': List(Utf8)})
Struct({'a': Int8, 'b': List(String)})

Initialize using a list of Field objects:

>>> dtype = pl.Struct([pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.Utf8))])
>>> dtype = pl.Struct(
... [pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.String))]
... )
>>> dtype
Struct({'a': Int8, 'b': List(Utf8)})
Struct({'a': Int8, 'b': List(String)})

When initializing a Series, Polars can infer a struct data type from the data.

Expand All @@ -766,7 +772,7 @@ def __init__(self, fields: Sequence[Field] | SchemaDict):
{2,["z"]}
]
>>> s.dtype
Struct({'a': Int64, 'b': List(Utf8)})
Struct({'a': Int64, 'b': List(String)})
"""
if isinstance(fields, Mapping):
self.fields = [Field(name, dtype) for name, dtype in fields.items()]
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes/constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
dt.Duration: PySeries.new_opt_i64,
dt.Time: PySeries.new_opt_i64,
dt.Boolean: PySeries.new_opt_bool,
dt.Utf8: PySeries.new_str,
dt.String: PySeries.new_str,
dt.Object: PySeries.new_object,
dt.Categorical: PySeries.new_str,
dt.Enum: PySeries.new_str,
Expand Down
18 changes: 9 additions & 9 deletions py-polars/polars/datatypes/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@
List,
Null,
Object,
String,
Struct,
Time,
UInt8,
UInt16,
UInt32,
UInt64,
Unknown,
Utf8,
)
from polars.dependencies import numpy as np
from polars.dependencies import pyarrow as pa
Expand All @@ -72,7 +72,7 @@
PY_STR_TO_DTYPE: SchemaDict = {
"float": Float64,
"int": Int64,
"str": Utf8,
"str": String,
"bool": Boolean,
"date": Date,
"datetime": Datetime("us"),
Expand All @@ -97,7 +97,7 @@ def _map_py_type_to_dtype(
if python_dtype is int:
return Int64
if python_dtype is str:
return Utf8
return String
if python_dtype is bool:
return Boolean
if issubclass(python_dtype, datetime):
Expand Down Expand Up @@ -172,16 +172,16 @@ def unpack_dtypes(
>>> struct_dtype = pl.Struct(
... [
... pl.Field("a", pl.Int64),
... pl.Field("b", pl.Utf8),
... pl.Field("b", pl.String),
... pl.Field("c", pl.List(pl.Float64)),
... ]
... )
>>> unpack_dtypes([struct_dtype, list_dtype]) # doctest: +IGNORE_RESULT
{Float64, Int64, Utf8}
{Float64, Int64, String}
>>> unpack_dtypes(
... [struct_dtype, list_dtype], include_compound=True
... ) # doctest: +IGNORE_RESULT
{Float64, Int64, Utf8, List(Float64), Struct([Field('a', Int64), Field('b', Utf8), Field('c', List(Float64))])}
{Float64, Int64, String, List(Float64), Struct([Field('a', Int64), Field('b', String), Field('c', List(Float64))])}

""" # noqa: W505
if not dtypes:
Expand Down Expand Up @@ -223,7 +223,7 @@ def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]:
Float64: "f64",
Decimal: "decimal",
Boolean: "bool",
Utf8: "str",
String: "str",
List: "list",
Date: "date",
Datetime: "datetime",
Expand Down Expand Up @@ -265,7 +265,7 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]:
Int32: int,
Int16: int,
Int8: int,
Utf8: str,
String: str,
UInt8: int,
UInt16: int,
UInt32: int,
Expand Down Expand Up @@ -472,7 +472,7 @@ def numpy_char_code_to_dtype(dtype_char: str) -> PolarsDataType:
"""Convert a numpy character dtype to a Polars dtype."""
dtype = np.dtype(dtype_char)
if dtype.kind == "U":
return Utf8
return String
try:
return DataTypeMappings.NUMPY_KIND_AND_ITEMSIZE_TO_DTYPE[
(dtype.kind, dtype.itemsize)
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/expr/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def encode(self, encoding: TransferEncoding) -> Expr:
Returns
-------
Expr
Expression of data type :class:`Utf8` with values encoded using provided
Expression of data type :class:`String` with values encoded using the
provided encoding.

Examples
Expand Down
Loading