pola-rs · stinodego · Dec 27, 2023 · Dec 26, 2023 · Dec 26, 2023 · Dec 26, 2023
@@ -61,8 +61,8 @@
 )
 
 out = df.select(
-    pl.col("integers").cast(pl.Utf8),
-    pl.col("float").cast(pl.Utf8),
+    pl.col("integers").cast(pl.String),
+    pl.col("float").cast(pl.String),
     pl.col("floats_as_string").cast(pl.Float64),
 )
 print(out)

@@ -59,5 +59,6 @@ Other
     Enum
     Null
     Object
+    String
     Utf8
     Unknown
@@ -53,6 +53,7 @@
     List,
     Null,
     Object,
+    String,
     Struct,
     Time,
     UInt8,
@@ -250,6 +251,7 @@
     "List",
     "Null",
     "Object",
+    "String",
     "Struct",
     "Time",
     "UInt16",

@@ -7,7 +7,7 @@
 
 import polars._reexport as pl
 from polars import functions as F
-from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8
+from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, String, Struct
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
 from polars.exceptions import NoDataError
@@ -152,7 +152,7 @@ def from_dicts(
     >>> pl.from_dicts(
     ...     data,
     ...     schema=["a", "b", "c", "d"],
-    ...     schema_overrides={"c": pl.Float64, "d": pl.Utf8},
+    ...     schema_overrides={"c": pl.Float64, "d": pl.String},
     ... )
     shape: (3, 4)
     ┌─────┬─────┬──────┬──────┐
@@ -286,15 +286,15 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
             if coldata:
                 coldata.pop(idx)
 
-    # init cols as utf8 Series, handle "null" -> None, create schema from repr dtype
+    # init cols as String Series, handle "null" -> None, create schema from repr dtype
     data = [
-        pl.Series([(None if v == "null" else v) for v in cd], dtype=Utf8)
+        pl.Series([(None if v == "null" else v) for v in cd], dtype=String)
         for cd in coldata
     ]
     schema = dict(zip(headers, (dtype_short_repr_to_dtype(d) for d in dtypes)))
     if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0:
         empty_data = [None] * len(data[0])
-        data.extend((pl.Series(empty_data, dtype=Utf8)) for _ in range(n_extend_cols))
+        data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols))
     for dtype in set(schema.values()):
         if dtype in (List, Struct, Object):
             raise NotImplementedError(
@@ -306,10 +306,10 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
     if no_dtypes:
         if df.is_empty():
             # if no dtypes *and* empty, default to string
-            return df.with_columns(F.all().cast(Utf8))
+            return df.with_columns(F.all().cast(String))
         else:
             # otherwise, take a trip through our CSV inference logic
-            if all(tp == Utf8 for tp in df.schema.values()):
+            if all(tp == String for tp in df.schema.values()):
                 buf = io.BytesIO()
                 df.write_csv(file=buf)
                 df = read_csv(buf, new_columns=df.columns, try_parse_dates=True)
@@ -347,10 +347,10 @@ def _from_series_repr(m: re.Match[str]) -> Series:
     if not values:
         return pl.Series(name=name, values=values, dtype=dtype)
     else:
-        srs = pl.Series(name=name, values=values, dtype=Utf8)
+        srs = pl.Series(name=name, values=values, dtype=String)
         if dtype is None:
             return srs
-        elif dtype in (Categorical, Utf8):
+        elif dtype in (Categorical, String):
             return srs.str.replace('^"(.*)"$', r"$1").cast(dtype)
 
         return _cast_repr_strings_with_schema(

@@ -43,8 +43,8 @@
     Float64,
     Null,
     Object,
+    String,
     Unknown,
-    Utf8,
     py_type_to_dtype,
 )
 from polars.dependencies import (
@@ -1228,7 +1228,7 @@ def dtypes(self) -> list[DataType]:
         ...     }
         ... )
         >>> df.dtypes
-        [Int64, Float64, Utf8]
+        [Int64, Float64, String]
         >>> df
         shape: (3, 3)
         ┌─────┬─────┬─────┐
@@ -1271,7 +1271,7 @@ def schema(self) -> OrderedDict[str, DataType]:
         ...     }
         ... )
         >>> df.schema
-        OrderedDict({'foo': Int64, 'bar': Float64, 'ham': Utf8})
+        OrderedDict({'foo': Int64, 'bar': Float64, 'ham': String})
 
         """
         return OrderedDict(zip(self.columns, self.dtypes))
@@ -1719,7 +1719,7 @@ def __getitem__(
 
         if isinstance(item, pl.Series):
             dtype = item.dtype
-            if dtype == Utf8:
+            if dtype == String:
                 return self._from_pydf(self._df.select(item))
             elif dtype.is_integer():
                 return self._take_with_series(item._pos_idxs(self.shape[0]))
@@ -2079,7 +2079,7 @@ def to_numpy(
 
         Notes
         -----
-        If you're attempting to convert Utf8 or Decimal to an array, you'll need to
+        If you're attempting to convert String or Decimal to an array, you'll need to
         install `pyarrow`.
 
         Examples
@@ -2123,7 +2123,7 @@ def to_numpy(
                 a = s.to_numpy(use_pyarrow=use_pyarrow)
                 arrays.append(
                     a.astype(str, copy=False)
-                    if tp == Utf8 and not s.null_count()
+                    if tp == String and not s.null_count()
                     else a
                 )
 
@@ -2309,15 +2309,15 @@ def to_init_repr(self, n: int = 1000) -> str:
         ...     [
         ...         pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
         ...         pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
-        ...         pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8),
+        ...         pl.Series("ham", ["a", "b", "c"], dtype=pl.String),
         ...     ]
         ... )
         >>> print(df.to_init_repr())
         pl.DataFrame(
             [
                 pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
                 pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
-                pl.Series("ham", ['a', 'b', 'c'], dtype=pl.Utf8),
+                pl.Series("ham", ['a', 'b', 'c'], dtype=pl.String),
             ]
         )
 
@@ -3848,7 +3848,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float:
         ...         "y": [v / 1000 for v in range(1_000_000)],
         ...         "z": [str(v) for v in range(1_000_000)],
         ...     },
-        ...     schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
+        ...     schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
         ... )
         >>> df.estimated_size()
         25888898
@@ -4267,7 +4267,7 @@ def glimpse(
         schema = self.schema
 
         def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
-            fn = repr if schema[col_name] == Utf8 else str
+            fn = repr if schema[col_name] == String else str
             values = self[:max_n_values][col_name].to_list()
             val_str = ", ".join(fn(v) for v in values)  # type: ignore[operator]
             if len(col_name) > max_colname_length:
@@ -6727,15 +6727,15 @@ def cast(
 
         Cast all frame columns to the specified dtype:
 
-        >>> df.cast(pl.Utf8).to_dict(as_series=False)
+        >>> df.cast(pl.String).to_dict(as_series=False)
         {'foo': ['1', '2', '3'],
          'bar': ['6.0', '7.0', '8.0'],
          'ham': ['2020-01-02', '2021-03-04', '2022-05-06']}
 
         Use selectors to define the columns being cast:
 
         >>> import polars.selectors as cs
-        >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8})
+        >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String})
         shape: (3, 3)
         ┌─────┬─────┬────────────┐
         │ foo ┆ bar ┆ ham        │
@@ -7089,7 +7089,7 @@ def explode(
         ----------
         columns
             Column names, expressions, or a selector defining them. The underlying
-            columns being exploded must be of List or Utf8 datatype.
+            columns being exploded must be of List or String datatype.
         *more_columns
             Additional names of columns to explode, specified as positional arguments.
 
@@ -9248,7 +9248,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
         An example of the supercast rules when applying an arithmetic operation on two
         DataTypes are for instance:
 
-        - Int8 + Utf8 = Utf8
+        - Int8 + String = String
         - Float32 + Int64 = Float32
         - Float32 + Float64 = Float64
 

@@ -22,6 +22,7 @@
     List,
     Null,
     Object,
+    String,
     Struct,
     TemporalType,
     Time,
@@ -97,6 +98,7 @@
     "List",
     "Null",
     "Object",
+    "String",
     "Struct",
     "TemporalType",
     "Time",

@@ -376,10 +376,14 @@ class Boolean(DataType):
     """Boolean type."""
 
 
-class Utf8(DataType):
+class String(DataType):
     """UTF-8 encoded string type."""
 
 
+# Keep Utf8 as a backward-compatible alias for String
+Utf8 = String
+
+
 class Binary(DataType):
     """Binary type."""
 
@@ -745,15 +749,17 @@ def __init__(self, fields: Sequence[Field] | SchemaDict):
         --------
         Initialize using a dictionary:
 
-        >>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.Utf8)})
+        >>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.String)})
         >>> dtype
-        Struct({'a': Int8, 'b': List(Utf8)})
+        Struct({'a': Int8, 'b': List(String)})
 
         Initialize using a list of Field objects:
 
-        >>> dtype = pl.Struct([pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.Utf8))])
+        >>> dtype = pl.Struct(
+        ...     [pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.String))]
+        ... )
         >>> dtype
-        Struct({'a': Int8, 'b': List(Utf8)})
+        Struct({'a': Int8, 'b': List(String)})
 
         When initializing a Series, Polars can infer a struct data type from the data.
 
@@ -766,7 +772,7 @@ def __init__(self, fields: Sequence[Field] | SchemaDict):
                 {2,["z"]}
         ]
         >>> s.dtype
-        Struct({'a': Int64, 'b': List(Utf8)})
+        Struct({'a': Int64, 'b': List(String)})
         """
         if isinstance(fields, Mapping):
             self.fields = [Field(name, dtype) for name, dtype in fields.items()]

@@ -38,7 +38,7 @@
         dt.Duration: PySeries.new_opt_i64,
         dt.Time: PySeries.new_opt_i64,
         dt.Boolean: PySeries.new_opt_bool,
-        dt.Utf8: PySeries.new_str,
+        dt.String: PySeries.new_str,
         dt.Object: PySeries.new_object,
         dt.Categorical: PySeries.new_str,
         dt.Enum: PySeries.new_str,

@@ -39,14 +39,14 @@
     List,
     Null,
     Object,
+    String,
     Struct,
     Time,
     UInt8,
     UInt16,
     UInt32,
     UInt64,
     Unknown,
-    Utf8,
 )
 from polars.dependencies import numpy as np
 from polars.dependencies import pyarrow as pa
@@ -72,7 +72,7 @@
 PY_STR_TO_DTYPE: SchemaDict = {
     "float": Float64,
     "int": Int64,
-    "str": Utf8,
+    "str": String,
     "bool": Boolean,
     "date": Date,
     "datetime": Datetime("us"),
@@ -97,7 +97,7 @@ def _map_py_type_to_dtype(
     if python_dtype is int:
         return Int64
     if python_dtype is str:
-        return Utf8
+        return String
     if python_dtype is bool:
         return Boolean
     if issubclass(python_dtype, datetime):
@@ -172,16 +172,16 @@ def unpack_dtypes(
     >>> struct_dtype = pl.Struct(
     ...     [
     ...         pl.Field("a", pl.Int64),
-    ...         pl.Field("b", pl.Utf8),
+    ...         pl.Field("b", pl.String),
     ...         pl.Field("c", pl.List(pl.Float64)),
     ...     ]
     ... )
     >>> unpack_dtypes([struct_dtype, list_dtype])  # doctest: +IGNORE_RESULT
-    {Float64, Int64, Utf8}
+    {Float64, Int64, String}
     >>> unpack_dtypes(
     ...     [struct_dtype, list_dtype], include_compound=True
     ... )  # doctest: +IGNORE_RESULT
-    {Float64, Int64, Utf8, List(Float64), Struct([Field('a', Int64), Field('b', Utf8), Field('c', List(Float64))])}
+    {Float64, Int64, String, List(Float64), Struct([Field('a', Int64), Field('b', String), Field('c', List(Float64))])}
 
     """  # noqa: W505
     if not dtypes:
@@ -223,7 +223,7 @@ def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]:
             Float64: "f64",
             Decimal: "decimal",
             Boolean: "bool",
-            Utf8: "str",
+            String: "str",
             List: "list",
             Date: "date",
             Datetime: "datetime",
@@ -265,7 +265,7 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]:
             Int32: int,
             Int16: int,
             Int8: int,
-            Utf8: str,
+            String: str,
             UInt8: int,
             UInt16: int,
             UInt32: int,
@@ -472,7 +472,7 @@ def numpy_char_code_to_dtype(dtype_char: str) -> PolarsDataType:
     """Convert a numpy character dtype to a Polars dtype."""
     dtype = np.dtype(dtype_char)
     if dtype.kind == "U":
-        return Utf8
+        return String
     try:
         return DataTypeMappings.NUMPY_KIND_AND_ITEMSIZE_TO_DTYPE[
             (dtype.kind, dtype.itemsize)

@@ -195,7 +195,7 @@ def encode(self, encoding: TransferEncoding) -> Expr:
         Returns
         -------
         Expr
-            Expression of data type :class:`Utf8` with values encoded using provided
+            Expression of data type :class:`String` with values encoded using provided
             encoding.
 
         Examples