feat(python): Rename Utf8 data type to String, keep Utf8 as alias (#13257)
stinodego committed Dec 27, 2023
1 parent 1ca453b commit a75ad4c
Showing 115 changed files with 600 additions and 543 deletions.
4 changes: 2 additions & 2 deletions docs/src/python/user-guide/expressions/casting.py
@@ -61,8 +61,8 @@
 )
 
 out = df.select(
-    pl.col("integers").cast(pl.Utf8),
-    pl.col("float").cast(pl.Utf8),
+    pl.col("integers").cast(pl.String),
+    pl.col("float").cast(pl.String),
     pl.col("floats_as_string").cast(pl.Float64),
 )
 print(out)
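For reference, a minimal sketch of the renamed cast (illustrative data; assumes a Polars version containing this change, in which `pl.Utf8` remains a working alias):

```python
import polars as pl

df = pl.DataFrame({"integers": [1, 2, 3], "float": [4.0, 5.5, 6.0]})

# `pl.String` is the canonical name after this commit; `pl.Utf8` still resolves to it.
out = df.select(
    pl.col("integers").cast(pl.String),
    pl.col("float").cast(pl.String),
)
print(out.schema)  # OrderedDict({'integers': String, 'float': String})
```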
4 changes: 2 additions & 2 deletions docs/src/python/user-guide/sql/intro.py
@@ -65,8 +65,8 @@
 
 # --8<-- [start:execute_multiple_sources]
 # Input data:
-# products_masterdata.csv with schema {'product_id': Int64, 'product_name': Utf8}
-# products_categories.json with schema {'product_id': Int64, 'category': Utf8}
+# products_masterdata.csv with schema {'product_id': Int64, 'product_name': String}
+# products_categories.json with schema {'product_id': Int64, 'category': String}
 # sales_data is a Pandas DataFrame with schema {'product_id': Int64, 'sales': Int64}
 
 ctx = pl.SQLContext(
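A small sketch of how the renamed dtype surfaces through `SQLContext` (hypothetical frame; the CSV/JSON sources in the snippet above are elided here):

```python
import polars as pl

products = pl.DataFrame({"product_id": [1, 2], "product_name": ["hammer", "nails"]})

# Registered frames keep their schemas; string columns now report as String.
ctx = pl.SQLContext(products=products)
out = ctx.execute("SELECT product_name FROM products", eager=True)
print(out.schema)  # OrderedDict({'product_name': String})
```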
6 changes: 3 additions & 3 deletions docs/user-guide/concepts/data-types/categoricals.md
@@ -269,7 +269,7 @@
 The following types of comparison operators are allowed for categorical data:
 
 - Categorical vs Categorical
-- Categorical vs Utf8
+- Categorical vs String
 
 #### `Categorical` Type
 
@@ -282,7 +282,7 @@
 --8<-- "python/user-guide/concepts/data-types/categoricals.py:global_equality"
 ```
 
-For `Categorical` vs `Utf8` comparisons Polars uses lexical ordering to determine the result:
+For `Categorical` vs `String` comparisons Polars uses lexical ordering to determine the result:
 
 {{code_block('user-guide/concepts/data-types/categoricals','str_compare_single',[])}}
 
@@ -306,7 +306,7 @@ For `Enum` type comparisons are valid if they have the same categories.
 --8<-- "python/user-guide/concepts/data-types/categoricals.py:equality"
 ```
 
-For `Enum` vs `Utf8` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `Utf8` column should be present in the `Enum` categories list.
+For `Enum` vs `String` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `String` column should be present in the `Enum` categories list.
 
 {{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_error',[])}}
 
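A sketch of the Categorical-vs-String comparison described above (illustrative values; behavior as documented, using lexical ordering):

```python
import polars as pl

cats = pl.Series("cats", ["apple", "banana", "cherry"], dtype=pl.Categorical)

# Comparing a Categorical column against a single string falls back to
# lexical ordering of the underlying string values.
print(cats <= "banana")  # [true, true, false]
```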
4 changes: 2 additions & 2 deletions docs/user-guide/concepts/data-types/overview.md
@@ -2,7 +2,7 @@
 
 Polars is entirely based on Arrow data types and backed by Arrow memory arrays. This makes data processing
 cache-efficient and well-supported for Inter Process Communication. Most data types follow the exact implementation
-from Arrow, with the exception of `Utf8` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). The data types are:
+from Arrow, with the exception of `String` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). The data types are:
 
 | Group    | Type          | Details |
 | -------- | ------------- | ------- |
@@ -23,7 +23,7 @@
 |          | `Duration`    | A timedelta type, internally represented as microseconds. Created when subtracting `Date/Datetime`. |
 |          | `Time`        | Time representation, internally represented as nanoseconds since midnight. |
 | Other    | `Boolean`     | Boolean type effectively bit packed. |
-|          | `Utf8`        | String data (this is actually Arrow `LargeUtf8` internally). |
+|          | `String`      | String data (this is actually Arrow `LargeUtf8` internally). |
 |          | `Binary`      | Store data as bytes. |
 |          | `Object`      | A limited supported data type that can be any value. |
 |          | `Categorical` | A categorical encoding of a set of strings. |
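A one-liner showing the renamed dtype in inference (illustrative data):

```python
import polars as pl

s = pl.Series(["polars"])
print(s.dtype)  # String (backed by Arrow LargeUtf8, per the table above)
```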
2 changes: 1 addition & 1 deletion docs/user-guide/expressions/casting.md
@@ -73,7 +73,7 @@
 
 ## Booleans
 
-Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`Utf8`) to a boolean is not permitted.
+Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`String`) to a boolean is not permitted.
 
 {{code_block('user-guide/expressions/casting','bool',['cast'])}}
 
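A minimal sketch of the casting rule stated above (illustrative data; the disallowed cast is left commented out):

```python
import polars as pl

df = pl.DataFrame({"flags": [0, 1, 1]})

# Numeric <-> Boolean casts are allowed:
print(df.select(pl.col("flags").cast(pl.Boolean)))

# String -> Boolean is not permitted and raises an error:
# pl.Series(["true", "false"]).cast(pl.Boolean)
```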
10 changes: 5 additions & 5 deletions docs/user-guide/expressions/plugins.md
@@ -60,9 +60,9 @@ fn pig_latin_str(value: &str, output: &mut String) {
     }
 }
 
-#[polars_expr(output_type=Utf8)]
+#[polars_expr(output_type=String)]
 fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
-    let ca = inputs[0].utf8()?;
+    let ca = inputs[0].str()?;
     let out: StringChunked = ca.apply_to_buffer(pig_latin_str);
     Ok(out.into_series())
 }
@@ -151,11 +151,11 @@ pub struct MyKwargs {
 /// If you want to accept `kwargs`, you define a `kwargs` argument
 /// on the second position in your plugin. You can provide any custom struct that is deserializable
 /// with the pickle protocol (on the Rust side).
-#[polars_expr(output_type=Utf8)]
+#[polars_expr(output_type=String)]
 fn append_kwargs(input: &[Series], kwargs: MyKwargs) -> PolarsResult<Series> {
     let input = &input[0];
-    let input = input.cast(&DataType::Utf8)?;
-    let ca = input.utf8().unwrap();
+    let input = input.cast(&DataType::String)?;
+    let ca = input.str().unwrap();
 
     Ok(ca
         .apply_to_buffer(|val, buf| {
4 changes: 2 additions & 2 deletions docs/user-guide/expressions/strings.md
@@ -1,12 +1,12 @@
 # Strings
 
-The following section discusses operations performed on `Utf8` strings, which are a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU.
+The following section discusses operations performed on `String` data, which is a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU.
 
 String processing functions are available in the `str` namespace.
 
 ##### Accessing the string namespace
 
-The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster.
+The `str` namespace can be accessed through the `.str` attribute of a column with `String` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster.
 
 {{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}}
 
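A sketch of the `str` namespace usage the paragraph above describes, using the `len_bytes`/`len_chars` names referenced in the code block (illustrative data):

```python
import polars as pl

df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})

out = df.select(
    pl.col("animal").str.len_bytes().alias("byte_count"),
    pl.col("animal").str.len_chars().alias("letter_count"),
)
print(out)
```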
4 changes: 2 additions & 2 deletions docs/user-guide/expressions/user-defined-functions.md
@@ -162,7 +162,7 @@ The mapping of Python types to Polars data types is as follows:
 - `int` -> `Int64`
 - `float` -> `Float64`
 - `bool` -> `Boolean`
-- `str` -> `Utf8`
+- `str` -> `String`
 - `list[tp]` -> `List[tp]` (where the inner type is inferred with the same rules)
 - `dict[str, [tp]]` -> `struct`
 - `Any` -> `object` (Prevent this at all times)
@@ -172,5 +172,5 @@ Rust types map as follows:
 - `i32` or `i64` -> `Int64`
 - `f32` or `f64` -> `Float64`
 - `bool` -> `Boolean`
-- `String` or `str` -> `Utf8`
+- `String` or `str` -> `String`
 - `Vec<tp>` -> `List[tp]` (where the inner type is inferred with the same rules)
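A hedged sketch of the `str -> String` mapping above, assuming the `map_elements` API of this Polars era (the lambda is purely illustrative):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})

# A Python `str` return value maps to the String dtype; return_dtype makes it explicit.
out = df.select(pl.col("a").map_elements(lambda x: f"<{x}>", return_dtype=pl.String))
print(out.schema)  # OrderedDict({'a': String})
```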
2 changes: 1 addition & 1 deletion py-polars/docs/source/reference/api.rst
@@ -93,7 +93,7 @@ Examples
     pl.DataFrame(
         data=["aaa", "bbb", "ccc", "ddd", "eee", "fff"],
-        columns=[("txt", pl.Utf8)],
+        columns=[("txt", pl.String)],
     ).split.by_alternate_rows()
 
     # [┌─────┐ ┌─────┐
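For orientation, a hypothetical re-creation of the custom `split` namespace the example above assumes (names and implementation are illustrative, not part of this commit):

```python
import polars as pl

@pl.api.register_dataframe_namespace("split")
class SplitFrame:
    def __init__(self, df: pl.DataFrame) -> None:
        self._df = df

    def by_alternate_rows(self) -> list[pl.DataFrame]:
        # Split even- and odd-indexed rows into two frames.
        df = self._df.with_row_count("n")
        return [
            df.filter(pl.col("n") % 2 == 0).drop("n"),
            df.filter(pl.col("n") % 2 == 1).drop("n"),
        ]
```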
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/datatypes.rst
@@ -59,5 +59,6 @@ Other
     Enum
     Null
     Object
+    String
     Utf8
     Unknown
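Both names stay documented because the old one is kept as an alias; a quick check (assuming a build with this commit):

```python
import polars as pl

print(pl.Utf8 is pl.String)  # True: Utf8 is an alias for String
print(pl.Series(["a"], dtype=pl.Utf8).dtype)  # String
```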
6 changes: 3 additions & 3 deletions py-polars/docs/source/reference/selectors.rst
@@ -62,7 +62,7 @@ Examples
             "JJK": pl.Date,
             "Lmn": pl.Duration,
             "opp": pl.Datetime("ms"),
-            "qqR": pl.Utf8,
+            "qqR": pl.String,
         },
     )
 
@@ -73,7 +73,7 @@ Examples
         "JJK": pl.Date,
         "Lmn": pl.Duration,
         "opp": pl.Datetime("ms"),
-        "qqR": pl.Utf8,
+        "qqR": pl.String,
     }
 
     # Select the INTERSECTION of temporal and column names that match "opp" OR "JJK"
@@ -98,7 +98,7 @@ Examples
         "fgg": pl.Boolean,
        "JJK": pl.Date,
        "opp": pl.Datetime("ms"),
-        "qqR": pl.Utf8,
+        "qqR": pl.String,
     }
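A minimal selectors sketch matching the renamed dtype (illustrative columns):

```python
import polars as pl
import polars.selectors as cs

df = pl.DataFrame(
    {"abc": [1, 2], "qqR": ["x", "y"]},
    schema={"abc": pl.UInt16, "qqR": pl.String},
)

# cs.string() selects String columns (previously documented as Utf8).
print(df.select(cs.string()).columns)  # ['qqR']
```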
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
@@ -53,6 +53,7 @@
     List,
     Null,
     Object,
+    String,
     Struct,
     Time,
     UInt8,
@@ -250,6 +251,7 @@
     "List",
     "Null",
     "Object",
+    "String",
     "Struct",
     "Time",
     "UInt16",
18 changes: 9 additions & 9 deletions py-polars/polars/convert.py
@@ -7,7 +7,7 @@
 
 import polars._reexport as pl
 from polars import functions as F
-from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8
+from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, String, Struct
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
 from polars.exceptions import NoDataError
@@ -152,7 +152,7 @@ def from_dicts(
     >>> pl.from_dicts(
     ...     data,
     ...     schema=["a", "b", "c", "d"],
-    ...     schema_overrides={"c": pl.Float64, "d": pl.Utf8},
+    ...     schema_overrides={"c": pl.Float64, "d": pl.String},
     ... )
     shape: (3, 4)
     ┌─────┬─────┬──────┬──────┐
@@ -286,15 +286,15 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
     if coldata:
         coldata.pop(idx)
 
-    # init cols as utf8 Series, handle "null" -> None, create schema from repr dtype
+    # init cols as String Series, handle "null" -> None, create schema from repr dtype
     data = [
-        pl.Series([(None if v == "null" else v) for v in cd], dtype=Utf8)
+        pl.Series([(None if v == "null" else v) for v in cd], dtype=String)
         for cd in coldata
     ]
     schema = dict(zip(headers, (dtype_short_repr_to_dtype(d) for d in dtypes)))
     if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0:
         empty_data = [None] * len(data[0])
-        data.extend((pl.Series(empty_data, dtype=Utf8)) for _ in range(n_extend_cols))
+        data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols))
     for dtype in set(schema.values()):
         if dtype in (List, Struct, Object):
             raise NotImplementedError(
@@ -306,10 +306,10 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
     if no_dtypes:
         if df.is_empty():
             # if no dtypes *and* empty, default to string
-            return df.with_columns(F.all().cast(Utf8))
+            return df.with_columns(F.all().cast(String))
         else:
             # otherwise, take a trip through our CSV inference logic
-            if all(tp == Utf8 for tp in df.schema.values()):
+            if all(tp == String for tp in df.schema.values()):
                 buf = io.BytesIO()
                 df.write_csv(file=buf)
                 df = read_csv(buf, new_columns=df.columns, try_parse_dates=True)
@@ -347,10 +347,10 @@ def _from_series_repr(m: re.Match[str]) -> Series:
     if not values:
         return pl.Series(name=name, values=values, dtype=dtype)
     else:
-        srs = pl.Series(name=name, values=values, dtype=Utf8)
+        srs = pl.Series(name=name, values=values, dtype=String)
         if dtype is None:
             return srs
-        elif dtype in (Categorical, Utf8):
+        elif dtype in (Categorical, String):
             return srs.str.replace('^"(.*)"$', r"$1").cast(dtype)
 
     return _cast_repr_strings_with_schema(
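The `from_dicts` override shown in the docstring, as a runnable sketch (illustrative data):

```python
import polars as pl

data = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]

# schema_overrides accepts the new name; pl.Utf8 would work identically.
df = pl.from_dicts(data, schema_overrides={"b": pl.String})
print(df.schema)  # OrderedDict({'a': Int64, 'b': String})
```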
28 changes: 14 additions & 14 deletions py-polars/polars/dataframe/frame.py
@@ -43,8 +43,8 @@
     Float64,
     Null,
     Object,
+    String,
     Unknown,
-    Utf8,
     py_type_to_dtype,
 )
 from polars.dependencies import (
@@ -1228,7 +1228,7 @@ def dtypes(self) -> list[DataType]:
         ...     }
         ... )
         >>> df.dtypes
-        [Int64, Float64, Utf8]
+        [Int64, Float64, String]
         >>> df
         shape: (3, 3)
         ┌─────┬─────┬─────┐
@@ -1271,7 +1271,7 @@ def schema(self) -> OrderedDict[str, DataType]:
         ...     }
         ... )
         >>> df.schema
-        OrderedDict({'foo': Int64, 'bar': Float64, 'ham': Utf8})
+        OrderedDict({'foo': Int64, 'bar': Float64, 'ham': String})
         """
         return OrderedDict(zip(self.columns, self.dtypes))
@@ -1719,7 +1719,7 @@ def __getitem__(
 
         if isinstance(item, pl.Series):
             dtype = item.dtype
-            if dtype == Utf8:
+            if dtype == String:
                 return self._from_pydf(self._df.select(item))
             elif dtype.is_integer():
                 return self._take_with_series(item._pos_idxs(self.shape[0]))
@@ -2079,7 +2079,7 @@ def to_numpy(
         Notes
         -----
-        If you're attempting to convert Utf8 or Decimal to an array, you'll need to
+        If you're attempting to convert String or Decimal to an array, you'll need to
         install `pyarrow`.
 
         Examples
@@ -2123,7 +2123,7 @@ def to_numpy(
                 a = s.to_numpy(use_pyarrow=use_pyarrow)
                 arrays.append(
                     a.astype(str, copy=False)
-                    if tp == Utf8 and not s.null_count()
+                    if tp == String and not s.null_count()
                     else a
                 )
 
@@ -2309,15 +2309,15 @@ def to_init_repr(self, n: int = 1000) -> str:
         ...     [
         ...         pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
         ...         pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
-        ...         pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8),
+        ...         pl.Series("ham", ["a", "b", "c"], dtype=pl.String),
         ...     ]
         ... )
         >>> print(df.to_init_repr())
         pl.DataFrame(
             [
                 pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
                 pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
-                pl.Series("ham", ['a', 'b', 'c'], dtype=pl.Utf8),
+                pl.Series("ham", ['a', 'b', 'c'], dtype=pl.String),
             ]
         )
@@ -3848,7 +3848,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float:
         ...         "y": [v / 1000 for v in range(1_000_000)],
         ...         "z": [str(v) for v in range(1_000_000)],
         ...     },
-        ...     schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
+        ...     schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
         ... )
         >>> df.estimated_size()
         25888898
@@ -4267,7 +4267,7 @@ def glimpse(
         schema = self.schema
 
         def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
-            fn = repr if schema[col_name] == Utf8 else str
+            fn = repr if schema[col_name] == String else str
            values = self[:max_n_values][col_name].to_list()
            val_str = ", ".join(fn(v) for v in values)  # type: ignore[operator]
            if len(col_name) > max_colname_length:
@@ -6727,15 +6727,15 @@ def cast(
         Cast all frame columns to the specified dtype:
 
-        >>> df.cast(pl.Utf8).to_dict(as_series=False)
+        >>> df.cast(pl.String).to_dict(as_series=False)
         {'foo': ['1', '2', '3'],
          'bar': ['6.0', '7.0', '8.0'],
          'ham': ['2020-01-02', '2021-03-04', '2022-05-06']}
 
         Use selectors to define the columns being cast:
 
         >>> import polars.selectors as cs
-        >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8})
+        >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String})
         shape: (3, 3)
         ┌─────┬─────┬────────────┐
         │ foo ┆ bar ┆ ham │
@@ -7089,7 +7089,7 @@ def explode(
         ----------
         columns
             Column names, expressions, or a selector defining them. The underlying
-            columns being exploded must be of List or Utf8 datatype.
+            columns being exploded must be of List or String datatype.
         *more_columns
             Additional names of columns to explode, specified as positional arguments.
@@ -9248,7 +9248,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
         An example of the supercast rules when applying an arithmetic operation on two
         DataTypes is, for instance:
 
-        - Int8 + Utf8 = Utf8
+        - Int8 + String = String
         - Float32 + Int64 = Float32
         - Float32 + Float64 = Float64
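A quick sketch of the renamed dtype in the `cast` and `fold` paths above (illustrative data; the supercast behavior follows the docstring rules):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# Frame-wide cast to String, as in the updated docstring example:
print(df.cast(pl.String).to_dict(as_series=False))
# {'a': ['1', '2'], 'b': ['x', 'y']}

# fold() supercasts mixed dtypes; per the rules above, Int64 + String = String.
print(df.fold(lambda s1, s2: s1 + s2).to_list())  # ['1x', '2y']
```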
2 changes: 2 additions & 0 deletions py-polars/polars/datatypes/__init__.py
@@ -22,6 +22,7 @@
     List,
     Null,
     Object,
+    String,
     Struct,
     TemporalType,
     Time,
@@ -97,6 +98,7 @@
     "List",
     "Null",
     "Object",
+    "String",
     "Struct",
     "TemporalType",
     "Time",
