diff --git a/docs/src/python/user-guide/expressions/casting.py b/docs/src/python/user-guide/expressions/casting.py index 5f248937743e..bd06f4038843 100644 --- a/docs/src/python/user-guide/expressions/casting.py +++ b/docs/src/python/user-guide/expressions/casting.py @@ -61,8 +61,8 @@ ) out = df.select( - pl.col("integers").cast(pl.Utf8), - pl.col("float").cast(pl.Utf8), + pl.col("integers").cast(pl.String), + pl.col("float").cast(pl.String), pl.col("floats_as_string").cast(pl.Float64), ) print(out) diff --git a/docs/src/python/user-guide/sql/intro.py b/docs/src/python/user-guide/sql/intro.py index 525b5e5cc6f6..1d334d6e8777 100644 --- a/docs/src/python/user-guide/sql/intro.py +++ b/docs/src/python/user-guide/sql/intro.py @@ -65,8 +65,8 @@ # --8<-- [start:execute_multiple_sources] # Input data: -# products_masterdata.csv with schema {'product_id': Int64, 'product_name': Utf8} -# products_categories.json with schema {'product_id': Int64, 'category': Utf8} +# products_masterdata.csv with schema {'product_id': Int64, 'product_name': String} +# products_categories.json with schema {'product_id': Int64, 'category': String} # sales_data is a Pandas DataFrame with schema {'product_id': Int64, 'sales': Int64} ctx = pl.SQLContext( diff --git a/docs/user-guide/concepts/data-types/categoricals.md b/docs/user-guide/concepts/data-types/categoricals.md index 5c5d2570aacb..d240fd0c5fc0 100644 --- a/docs/user-guide/concepts/data-types/categoricals.md +++ b/docs/user-guide/concepts/data-types/categoricals.md @@ -269,7 +269,7 @@ Polars will raise an `OutOfBounds` error when a value is encountered which is no The following types of comparisons operators are allowed for categorical data: - Categorical vs Categorical -- Categorical vs Utf8 +- Categorical vs String #### `Categorical` Type @@ -282,7 +282,7 @@ For the `Categorical` type comparisons are valid if they have the same global ca --8<-- "python/user-guide/concepts/data-types/categoricals.py:global_equality" ``` -For `Categorical` 
vs `Utf8` comparisons Polars uses lexical ordering to determine the result: +For `Categorical` vs `String` comparisons Polars uses lexical ordering to determine the result: {{code_block('user-guide/concepts/data-types/categoricals','str_compare_single',[])}} @@ -306,7 +306,7 @@ For `Enum` type comparisons are valid if they have the same categories. --8<-- "python/user-guide/concepts/data-types/categoricals.py:equality" ``` -For `Enum` vs `Utf8` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `Utf8` column should be present in the `Enum` categories list. +For `Enum` vs `String` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `String` column should be present in the `Enum` categories list. {{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_error',[])}} diff --git a/docs/user-guide/concepts/data-types/overview.md b/docs/user-guide/concepts/data-types/overview.md index e25a35b676b5..30e7073bccc5 100644 --- a/docs/user-guide/concepts/data-types/overview.md +++ b/docs/user-guide/concepts/data-types/overview.md @@ -2,7 +2,7 @@ Polars is entirely based on Arrow data types and backed by Arrow memory arrays. This makes data processing cache-efficient and well-supported for Inter Process Communication. Most data types follow the exact implementation -from Arrow, with the exception of `Utf8` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). The data types are: +from Arrow, with the exception of `String` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). 
The data types are: | Group | Type | Details | | -------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------ | @@ -23,7 +23,7 @@ from Arrow, with the exception of `Utf8` (this is actually `LargeUtf8`), `Catego | | `Duration` | A timedelta type, internally represented as microseconds. Created when subtracting `Date/Datetime`. | | | `Time` | Time representation, internally represented as nanoseconds since midnight. | | Other | `Boolean` | Boolean type effectively bit packed. | -| | `Utf8` | String data (this is actually Arrow `LargeUtf8` internally). | +| | `String` | String data (this is actually Arrow `LargeUtf8` internally). | | | `Binary` | Store data as bytes. | | | `Object` | A limited supported data type that can be any value. | | | `Categorical` | A categorical encoding of a set of strings. | diff --git a/docs/user-guide/expressions/casting.md b/docs/user-guide/expressions/casting.md index b239bff1b460..6deddaecb684 100644 --- a/docs/user-guide/expressions/casting.md +++ b/docs/user-guide/expressions/casting.md @@ -73,7 +73,7 @@ In case the column contains a non-numerical value, Polars will throw a `ComputeE ## Booleans -Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`Utf8`) to a boolean is not permitted. +Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`String`) to a boolean is not permitted. 
{{code_block('user-guide/expressions/casting','bool',['cast'])}} diff --git a/docs/user-guide/expressions/plugins.md b/docs/user-guide/expressions/plugins.md index 18cb25856d37..727e7f1acb07 100644 --- a/docs/user-guide/expressions/plugins.md +++ b/docs/user-guide/expressions/plugins.md @@ -60,9 +60,9 @@ fn pig_latin_str(value: &str, output: &mut String) { } } -#[polars_expr(output_type=Utf8)] +#[polars_expr(output_type=String)] fn pig_latinnify(inputs: &[Series]) -> PolarsResult { - let ca = inputs[0].utf8()?; + let ca = inputs[0].str()?; let out: StringChunked = ca.apply_to_buffer(pig_latin_str); Ok(out.into_series()) } @@ -151,11 +151,11 @@ pub struct MyKwargs { /// If you want to accept `kwargs`. You define a `kwargs` argument /// on the second position in you plugin. You can provide any custom struct that is deserializable /// with the pickle protocol (on the Rust side). -#[polars_expr(output_type=Utf8)] +#[polars_expr(output_type=String)] fn append_kwargs(input: &[Series], kwargs: MyKwargs) -> PolarsResult { let input = &input[0]; - let input = input.cast(&DataType::Utf8)?; - let ca = input.utf8().unwrap(); + let input = input.cast(&DataType::String)?; + let ca = input.str().unwrap(); Ok(ca .apply_to_buffer(|val, buf| { diff --git a/docs/user-guide/expressions/strings.md b/docs/user-guide/expressions/strings.md index 72315547b0e0..4b47088faac0 100644 --- a/docs/user-guide/expressions/strings.md +++ b/docs/user-guide/expressions/strings.md @@ -1,12 +1,12 @@ # Strings -The following section discusses operations performed on `Utf8` strings, which are a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU. 
+The following section discusses operations performed on `String` data, which is a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU. String processing functions are available in the `str` namespace. ##### Accessing the string namespace -The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster. +The `str` namespace can be accessed through the `.str` attribute of a column with `String` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster. 
{{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}} diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 3d508a4225da..e24468d60337 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -162,7 +162,7 @@ The mapping of Python types to Polars data types is as follows: - `int` -> `Int64` - `float` -> `Float64` - `bool` -> `Boolean` -- `str` -> `Utf8` +- `str` -> `String` - `list[tp]` -> `List[tp]` (where the inner type is inferred with the same rules) - `dict[str, [tp]]` -> `struct` - `Any` -> `object` (Prevent this at all times) @@ -172,5 +172,5 @@ Rust types map as follows: - `i32` or `i64` -> `Int64` - `f32` or `f64` -> `Float64` - `bool` -> `Boolean` -- `String` or `str` -> `Utf8` +- `String` or `str` -> `String` - `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) diff --git a/py-polars/docs/source/reference/api.rst b/py-polars/docs/source/reference/api.rst index 33ae1f27ac92..26c708ea1fac 100644 --- a/py-polars/docs/source/reference/api.rst +++ b/py-polars/docs/source/reference/api.rst @@ -93,7 +93,7 @@ Examples pl.DataFrame( data=["aaa", "bbb", "ccc", "ddd", "eee", "fff"], - columns=[("txt", pl.Utf8)], + columns=[("txt", pl.String)], ).split.by_alternate_rows() # [┌─────┐ ┌─────┐ diff --git a/py-polars/docs/source/reference/datatypes.rst b/py-polars/docs/source/reference/datatypes.rst index 196bcc41a1f2..3e538998b002 100644 --- a/py-polars/docs/source/reference/datatypes.rst +++ b/py-polars/docs/source/reference/datatypes.rst @@ -59,5 +59,6 @@ Other Enum Null Object + String Utf8 Unknown diff --git a/py-polars/docs/source/reference/selectors.rst b/py-polars/docs/source/reference/selectors.rst index 064cd530f968..0c4ad7ec2641 100644 --- a/py-polars/docs/source/reference/selectors.rst +++ b/py-polars/docs/source/reference/selectors.rst @@ -62,7 +62,7 @@ 
Examples "JJK": pl.Date, "Lmn": pl.Duration, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, }, ) @@ -73,7 +73,7 @@ Examples "JJK": pl.Date, "Lmn": pl.Duration, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, } # Select the INTERSECTION of temporal and column names that match "opp" OR "JJK" @@ -98,7 +98,7 @@ Examples "fgg": pl.Boolean, "JJK": pl.Date, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, } diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index a367ad7701a0..459169a4aea9 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -53,6 +53,7 @@ List, Null, Object, + String, Struct, Time, UInt8, @@ -250,6 +251,7 @@ "List", "Null", "Object", + "String", "Struct", "Time", "UInt16", diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index dba571d59d17..0014329b5a36 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -7,7 +7,7 @@ import polars._reexport as pl from polars import functions as F -from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8 +from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, String, Struct from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import NoDataError @@ -152,7 +152,7 @@ def from_dicts( >>> pl.from_dicts( ... data, ... schema=["a", "b", "c", "d"], - ... schema_overrides={"c": pl.Float64, "d": pl.Utf8}, + ... schema_overrides={"c": pl.Float64, "d": pl.String}, ... 
) shape: (3, 4) ┌─────┬─────┬──────┬──────┐ @@ -286,15 +286,15 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame: if coldata: coldata.pop(idx) - # init cols as utf8 Series, handle "null" -> None, create schema from repr dtype + # init cols as String Series, handle "null" -> None, create schema from repr dtype data = [ - pl.Series([(None if v == "null" else v) for v in cd], dtype=Utf8) + pl.Series([(None if v == "null" else v) for v in cd], dtype=String) for cd in coldata ] schema = dict(zip(headers, (dtype_short_repr_to_dtype(d) for d in dtypes))) if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0: empty_data = [None] * len(data[0]) - data.extend((pl.Series(empty_data, dtype=Utf8)) for _ in range(n_extend_cols)) + data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols)) for dtype in set(schema.values()): if dtype in (List, Struct, Object): raise NotImplementedError( @@ -306,10 +306,10 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame: if no_dtypes: if df.is_empty(): # if no dtypes *and* empty, default to string - return df.with_columns(F.all().cast(Utf8)) + return df.with_columns(F.all().cast(String)) else: # otherwise, take a trip through our CSV inference logic - if all(tp == Utf8 for tp in df.schema.values()): + if all(tp == String for tp in df.schema.values()): buf = io.BytesIO() df.write_csv(file=buf) df = read_csv(buf, new_columns=df.columns, try_parse_dates=True) @@ -347,10 +347,10 @@ def _from_series_repr(m: re.Match[str]) -> Series: if not values: return pl.Series(name=name, values=values, dtype=dtype) else: - srs = pl.Series(name=name, values=values, dtype=Utf8) + srs = pl.Series(name=name, values=values, dtype=String) if dtype is None: return srs - elif dtype in (Categorical, Utf8): + elif dtype in (Categorical, String): return srs.str.replace('^"(.*)"$', r"$1").cast(dtype) return _cast_repr_strings_with_schema( diff --git a/py-polars/polars/dataframe/frame.py 
b/py-polars/polars/dataframe/frame.py index 4a00bdeb66a8..36fe2fc06faf 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -43,8 +43,8 @@ Float64, Null, Object, + String, Unknown, - Utf8, py_type_to_dtype, ) from polars.dependencies import ( @@ -1228,7 +1228,7 @@ def dtypes(self) -> list[DataType]: ... } ... ) >>> df.dtypes - [Int64, Float64, Utf8] + [Int64, Float64, String] >>> df shape: (3, 3) ┌─────┬─────┬─────┐ @@ -1271,7 +1271,7 @@ def schema(self) -> OrderedDict[str, DataType]: ... } ... ) >>> df.schema - OrderedDict({'foo': Int64, 'bar': Float64, 'ham': Utf8}) + OrderedDict({'foo': Int64, 'bar': Float64, 'ham': String}) """ return OrderedDict(zip(self.columns, self.dtypes)) @@ -1719,7 +1719,7 @@ def __getitem__( if isinstance(item, pl.Series): dtype = item.dtype - if dtype == Utf8: + if dtype == String: return self._from_pydf(self._df.select(item)) elif dtype.is_integer(): return self._take_with_series(item._pos_idxs(self.shape[0])) @@ -2079,7 +2079,7 @@ def to_numpy( Notes ----- - If you're attempting to convert Utf8 or Decimal to an array, you'll need to + If you're attempting to convert String or Decimal to an array, you'll need to install `pyarrow`. Examples @@ -2123,7 +2123,7 @@ def to_numpy( a = s.to_numpy(use_pyarrow=use_pyarrow) arrays.append( a.astype(str, copy=False) - if tp == Utf8 and not s.null_count() + if tp == String and not s.null_count() else a ) @@ -2309,7 +2309,7 @@ def to_init_repr(self, n: int = 1000) -> str: ... [ ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), ... ] ... 
) >>> print(df.to_init_repr()) @@ -2317,7 +2317,7 @@ def to_init_repr(self, n: int = 1000) -> str: [ pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", ['a', 'b', 'c'], dtype=pl.Utf8), + pl.Series("ham", ['a', 'b', 'c'], dtype=pl.String), ] ) @@ -3848,7 +3848,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: ... "y": [v / 1000 for v in range(1_000_000)], ... "z": [str(v) for v in range(1_000_000)], ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)], ... ) >>> df.estimated_size() 25888898 @@ -4267,7 +4267,7 @@ def glimpse( schema = self.schema def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]: - fn = repr if schema[col_name] == Utf8 else str + fn = repr if schema[col_name] == String else str values = self[:max_n_values][col_name].to_list() val_str = ", ".join(fn(v) for v in values) # type: ignore[operator] if len(col_name) > max_colname_length: @@ -6727,7 +6727,7 @@ def cast( Cast all frame columns to the specified dtype: - >>> df.cast(pl.Utf8).to_dict(as_series=False) + >>> df.cast(pl.String).to_dict(as_series=False) {'foo': ['1', '2', '3'], 'bar': ['6.0', '7.0', '8.0'], 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} @@ -6735,7 +6735,7 @@ def cast( Use selectors to define the columns being cast: >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) shape: (3, 3) ┌─────┬─────┬────────────┐ │ foo ┆ bar ┆ ham │ @@ -7089,7 +7089,7 @@ def explode( ---------- columns Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. + columns being exploded must be of List or String datatype. *more_columns Additional names of columns to explode, specified as positional arguments. 
@@ -9248,7 +9248,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series: An example of the supercast rules when applying an arithmetic operation on two DataTypes are for instance: - - Int8 + Utf8 = Utf8 + - Int8 + String = String - Float32 + Int64 = Float32 - Float32 + Float64 = Float64 diff --git a/py-polars/polars/datatypes/__init__.py b/py-polars/polars/datatypes/__init__.py index 631d072e267d..2c9ed7bc0251 100644 --- a/py-polars/polars/datatypes/__init__.py +++ b/py-polars/polars/datatypes/__init__.py @@ -22,6 +22,7 @@ List, Null, Object, + String, Struct, TemporalType, Time, @@ -97,6 +98,7 @@ "List", "Null", "Object", + "String", "Struct", "TemporalType", "Time", diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index b6bf0d03117b..39647be83202 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -376,10 +376,14 @@ class Boolean(DataType): """Boolean type.""" -class Utf8(DataType): +class String(DataType): """UTF-8 encoded string type.""" +# Allow Utf8 as an alias for String +Utf8 = String + + class Binary(DataType): """Binary type.""" @@ -745,15 +749,17 @@ def __init__(self, fields: Sequence[Field] | SchemaDict): -------- Initialize using a dictionary: - >>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.Utf8)}) + >>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.String)}) >>> dtype - Struct({'a': Int8, 'b': List(Utf8)}) + Struct({'a': Int8, 'b': List(String)}) Initialize using a list of Field objects: - >>> dtype = pl.Struct([pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.Utf8))]) + >>> dtype = pl.Struct( + ... [pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.String))] + ... ) >>> dtype - Struct({'a': Int8, 'b': List(Utf8)}) + Struct({'a': Int8, 'b': List(String)}) When initializing a Series, Polars can infer a struct data type from the data. 
@@ -766,7 +772,7 @@ def __init__(self, fields: Sequence[Field] | SchemaDict): {2,["z"]} ] >>> s.dtype - Struct({'a': Int64, 'b': List(Utf8)}) + Struct({'a': Int64, 'b': List(String)}) """ if isinstance(fields, Mapping): self.fields = [Field(name, dtype) for name, dtype in fields.items()] diff --git a/py-polars/polars/datatypes/constructor.py b/py-polars/polars/datatypes/constructor.py index 7e322b0071c4..14c79b7d7acf 100644 --- a/py-polars/polars/datatypes/constructor.py +++ b/py-polars/polars/datatypes/constructor.py @@ -38,7 +38,7 @@ dt.Duration: PySeries.new_opt_i64, dt.Time: PySeries.new_opt_i64, dt.Boolean: PySeries.new_opt_bool, - dt.Utf8: PySeries.new_str, + dt.String: PySeries.new_str, dt.Object: PySeries.new_object, dt.Categorical: PySeries.new_str, dt.Enum: PySeries.new_str, diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index 73bb6a15f33a..6507e6835386 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -39,6 +39,7 @@ List, Null, Object, + String, Struct, Time, UInt8, @@ -46,7 +47,6 @@ UInt32, UInt64, Unknown, - Utf8, ) from polars.dependencies import numpy as np from polars.dependencies import pyarrow as pa @@ -72,7 +72,7 @@ PY_STR_TO_DTYPE: SchemaDict = { "float": Float64, "int": Int64, - "str": Utf8, + "str": String, "bool": Boolean, "date": Date, "datetime": Datetime("us"), @@ -97,7 +97,7 @@ def _map_py_type_to_dtype( if python_dtype is int: return Int64 if python_dtype is str: - return Utf8 + return String if python_dtype is bool: return Boolean if issubclass(python_dtype, datetime): @@ -172,16 +172,16 @@ def unpack_dtypes( >>> struct_dtype = pl.Struct( ... [ ... pl.Field("a", pl.Int64), - ... pl.Field("b", pl.Utf8), + ... pl.Field("b", pl.String), ... pl.Field("c", pl.List(pl.Float64)), ... ] ... ) >>> unpack_dtypes([struct_dtype, list_dtype]) # doctest: +IGNORE_RESULT - {Float64, Int64, Utf8} + {Float64, Int64, String} >>> unpack_dtypes( ... 
[struct_dtype, list_dtype], include_compound=True ... ) # doctest: +IGNORE_RESULT - {Float64, Int64, Utf8, List(Float64), Struct([Field('a', Int64), Field('b', Utf8), Field('c', List(Float64))])} + {Float64, Int64, String, List(Float64), Struct([Field('a', Int64), Field('b', String), Field('c', List(Float64))])} """ # noqa: W505 if not dtypes: @@ -223,7 +223,7 @@ def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]: Float64: "f64", Decimal: "decimal", Boolean: "bool", - Utf8: "str", + String: "str", List: "list", Date: "date", Datetime: "datetime", @@ -265,7 +265,7 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]: Int32: int, Int16: int, Int8: int, - Utf8: str, + String: str, UInt8: int, UInt16: int, UInt32: int, @@ -472,7 +472,7 @@ def numpy_char_code_to_dtype(dtype_char: str) -> PolarsDataType: """Convert a numpy character dtype to a Polars dtype.""" dtype = np.dtype(dtype_char) if dtype.kind == "U": - return Utf8 + return String try: return DataTypeMappings.NUMPY_KIND_AND_ITEMSIZE_TO_DTYPE[ (dtype.kind, dtype.itemsize) diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py index b3fbcbdf850d..37eecd5eb150 100644 --- a/py-polars/polars/expr/binary.py +++ b/py-polars/polars/expr/binary.py @@ -195,7 +195,7 @@ def encode(self, encoding: TransferEncoding) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8` with values encoded using provided + Expression of data type :class:`String` with values encoded using provided encoding. Examples diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 980f3c0bfa6f..df75ef933c7b 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -393,9 +393,9 @@ def combine(self, time: dt.time | Expr, time_unit: TimeUnit = "us") -> Expr: def to_string(self, format: str) -> Expr: """ - Convert a Date/Time/Datetime column into a Utf8 column with the given format. 
+ Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.Utf8)`, but this method allows you to customize the + Similar to `cast(pl.String)`, but this method allows you to customize the formatting of the resulting string. Parameters @@ -438,9 +438,9 @@ def to_string(self, format: str) -> Expr: def strftime(self, format: str) -> Expr: """ - Convert a Date/Time/Datetime column into a Utf8 column with the given format. + Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.Utf8)`, but this method allows you to customize the + Similar to `cast(pl.String)`, but this method allows you to customize the formatting of the resulting string. Alias for :func:`to_string`. diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 9c0f6d6331d3..2fba1518f1b1 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -574,7 +574,7 @@ def join(self, separator: IntoExpr) -> Expr: """ Join all string items in a sublist and place a separator between them. - This errors if inner type of list `!= Utf8`. + This errors if inner type of list `!= String`. Parameters ---------- @@ -584,7 +584,7 @@ def join(self, separator: IntoExpr) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. + Expression of data type :class:`String`. Examples -------- diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 335f8ac5a1bb..fab17a7f17f1 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -46,7 +46,7 @@ def to_date( cache: bool = True, ) -> Expr: """ - Convert a Utf8 column into a Date column. + Convert a String column into a Date column. Parameters ---------- @@ -96,7 +96,7 @@ def to_datetime( ambiguous: Ambiguous | Expr = "raise", ) -> Expr: """ - Convert a Utf8 column into a Datetime column. + Convert a String column into a Datetime column. 
Parameters ---------- @@ -174,7 +174,7 @@ def to_time( cache: bool = True, ) -> Expr: """ - Convert a Utf8 column into a Time column. + Convert a String column into a Time column. Parameters ---------- @@ -216,7 +216,7 @@ def strptime( ambiguous: Ambiguous | Expr = "raise", ) -> Expr: """ - Convert a Utf8 column into a Date/Datetime/Time column. + Convert a String column into a Date/Datetime/Time column. Parameters ---------- @@ -326,7 +326,7 @@ def to_decimal( inference_length: int = 100, ) -> Expr: """ - Convert a Utf8 column into a Decimal column. + Convert a String column into a Decimal column. This method infers the needed parameters `precision` and `scale`. @@ -469,7 +469,7 @@ def concat(self, delimiter: str = "-", *, ignore_nulls: bool = True) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. + Expression of data type :class:`String`. Examples -------- @@ -951,7 +951,7 @@ def zfill(self, length: int) -> Expr: Examples -------- >>> df = pl.DataFrame({"a": [-1, 123, 999999, None]}) - >>> df.with_columns(zfill=pl.col("a").cast(pl.Utf8).str.zfill(4)) + >>> df.with_columns(zfill=pl.col("a").cast(pl.String).str.zfill(4)) shape: (4, 2) ┌────────┬────────┐ │ a ┆ zfill │ @@ -1215,7 +1215,7 @@ def json_path_match(self, json_path: str) -> Expr: Extract the first match of JSON string with the provided JSONPath expression. Throws errors if invalid JSON strings are encountered. - All return values will be cast to :class:`Utf8` regardless of the original + All return values will be cast to :class:`String` regardless of the original value. Documentation on JSONPath standard can be found @@ -1229,7 +1229,7 @@ def json_path_match(self, json_path: str) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. Contains null values if original + Expression of data type :class:`String`. Contains null values if original value is null or the json_path returns nothing. 
Examples @@ -1288,7 +1288,7 @@ def encode(self, encoding: TransferEncoding) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. + Expression of data type :class:`String`. Examples -------- @@ -1365,7 +1365,7 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. Contains null values if original + Expression of data type :class:`String`. Contains null values if original value is null or the regex captures nothing. Examples @@ -1460,7 +1460,7 @@ def extract_all(self, pattern: str | Expr) -> Expr: Returns ------- Expr - Expression of data type `List(Utf8)`. + Expression of data type `List(String)`. Examples -------- @@ -1522,7 +1522,7 @@ def extract_groups(self, pattern: str) -> Expr: ------- Expr Expression of data type :class:`Struct` with fields of data type - :class:`Utf8`. + :class:`String`. Examples -------- @@ -1682,7 +1682,7 @@ def split(self, by: IntoExpr, *, inclusive: bool = False) -> Expr: Returns ------- Expr - Expression of data type :class:`Utf8`. + Expression of data type :class:`String`. """ by = parse_as_expression(by, str_as_lit=True) @@ -1711,7 +1711,7 @@ def split_exact(self, by: IntoExpr, n: int, *, inclusive: bool = False) -> Expr: ------- Expr Expression of data type :class:`Struct` with fields of data type - :class:`Utf8`. + :class:`String`. Examples -------- @@ -1780,7 +1780,7 @@ def splitn(self, by: IntoExpr, n: int) -> Expr: ------- Expr Expression of data type :class:`Struct` with fields of data type - :class:`Utf8`. + :class:`String`. Examples -------- @@ -1967,7 +1967,7 @@ def reverse(self) -> Expr: def slice(self, offset: int, length: int | None = None) -> Expr: """ - Create subslices of the string values of a Utf8 Series. + Create subslices of the string values of a String Series. 
        Parameters
        ----------
@@ -1980,7 +1980,7 @@ def slice(self, offset: int, length: int | None = None) -> Expr:
        Returns
        -------
        Expr
-            Expression of data type :class:`Utf8`.
+            Expression of data type :class:`String`.

        Examples
        --------
@@ -2027,7 +2027,7 @@ def explode(self) -> Expr:
        Returns
        -------
        Expr
-            Expression of data type :class:`Utf8`.
+            Expression of data type :class:`String`.

        Examples
        --------
@@ -2052,7 +2052,7 @@ def explode(self) -> Expr:

    def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr:
        """
-        Convert an Utf8 column into an Int64 column with base radix.
+        Convert a String column into an Int64 column with base radix.

        Parameters
        ----------
diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py
index 94c1b07b1152..f832c4e9d6b0 100644
--- a/py-polars/polars/functions/as_datatype.py
+++ b/py-polars/polars/functions/as_datatype.py
@@ -492,7 +492,7 @@ def concat_str(
    exprs
        Columns to concatenate into a single string column. Accepts expression input.
        Strings are parsed as column names, other non-expression inputs are parsed as
-        literals. Non-`Utf8` columns are cast to `Utf8`.
+        literals. Non-`String` columns are cast to `String`.
    *more_exprs
        Additional columns to concatenate into a single string column, specified as
        positional arguments.
diff --git a/py-polars/polars/functions/col.py b/py-polars/polars/functions/col.py
index 6b289f0a49a7..538165e6ce53 100644
--- a/py-polars/polars/functions/col.py
+++ b/py-polars/polars/functions/col.py
@@ -262,7 +262,7 @@ def __new__(  # type: ignore[misc]
    Easily select all columns that match a certain data type by passing that
    datatype.
- >>> df.select(pl.col(pl.Utf8)) + >>> df.select(pl.col(pl.String)) shape: (2, 1) ┌─────┐ │ bar │ diff --git a/py-polars/polars/interchange/utils.py b/py-polars/polars/interchange/utils.py index f15f50c00e67..7460621207f2 100644 --- a/py-polars/polars/interchange/utils.py +++ b/py-polars/polars/interchange/utils.py @@ -14,12 +14,12 @@ Int16, Int32, Int64, + String, Time, UInt8, UInt16, UInt32, UInt64, - Utf8, ) from polars.interchange.protocol import DtypeKind, Endianness @@ -42,7 +42,7 @@ Float32: (DtypeKind.FLOAT, 32, "f", NE), Float64: (DtypeKind.FLOAT, 64, "g", NE), Boolean: (DtypeKind.BOOL, 1, "b", NE), - Utf8: (DtypeKind.STRING, 8, "U", NE), + String: (DtypeKind.STRING, 8, "U", NE), Date: (DtypeKind.DATETIME, 32, "tdD", NE), Time: (DtypeKind.DATETIME, 64, "ttu", NE), Datetime: (DtypeKind.DATETIME, 64, "tsu:", NE), diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index e30db192ed12..6ab377d10ac4 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Mapping, Sequence, TextIO import polars._reexport as pl -from polars.datatypes import N_INFER_DEFAULT, Utf8 +from polars.datatypes import N_INFER_DEFAULT, String from polars.io._utils import _prepare_file_arg from polars.io.csv._utils import _check_arg_is_1byte, _update_columns from polars.io.csv.batched_reader import BatchedCsvReader @@ -107,12 +107,12 @@ def read_csv( Before using this option, try to increase the number of lines used for schema inference with e.g `infer_schema_length=10000` or override automatic dtype inference for specific columns with the `dtypes` option or use - `infer_schema_length=0` to read all columns as `pl.Utf8` to check which + `infer_schema_length=0` to read all columns as `pl.String` to check which values might cause an issue. try_parse_dates Try to automatically parse dates. 
Most ISO8601-like formats can be inferred, as well as a handful of others. If this does not succeed, - the column remains of data type `pl.Utf8`. + the column remains of data type `pl.String`. If `use_pyarrow=True`, dates will always be parsed. n_threads Number of threads to use in csv parsing. @@ -122,7 +122,7 @@ def read_csv( If schema is inferred wrongly (e.g. as `pl.Int64` instead of `pl.Float64`), try to increase the number of lines used to infer the schema or override inferred dtype for those columns with `dtypes`. - If set to 0, all columns will be read as `pl.Utf8`. + If set to 0, all columns will be read as `pl.String`. If set to `None`, a full table scan will be done (slow). batch_size Number of lines to read into the buffer at once. @@ -279,7 +279,7 @@ def read_csv( # Fix list of dtypes when used together with projection as polars CSV reader # wants a list of dtypes for the x first columns before it does the projection. - dtypes_list: list[PolarsDataType] = [Utf8] * (max(projection) + 1) + dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1) for idx, column_idx in enumerate(projection): if idx < len(dtypes): @@ -484,17 +484,17 @@ def read_csv_batched( ignore_errors Try to keep reading lines if some lines yield errors. First try `infer_schema_length=0` to read all columns as - `pl.Utf8` to check which values might cause an issue. + `pl.String` to check which values might cause an issue. try_parse_dates Try to automatically parse dates. Most ISO8601-like formats can be inferred, as well as a handful of others. If this does not succeed, - the column remains of data type `pl.Utf8`. + the column remains of data type `pl.String`. n_threads Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system. infer_schema_length Maximum number of lines to read to infer schema. - If set to 0, all columns will be read as `pl.Utf8`. + If set to 0, all columns will be read as `pl.String`. 
If set to `None`, a full table scan will be done (slow). batch_size Number of lines to read into the buffer at once. @@ -590,7 +590,7 @@ def read_csv_batched( # Fix list of dtypes when used together with projection as polars CSV reader # wants a list of dtypes for the x first columns before it does the projection. - dtypes_list: list[PolarsDataType] = [Utf8] * (max(projection) + 1) + dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1) for idx, column_idx in enumerate(projection): if idx < len(dtypes): @@ -778,7 +778,7 @@ def scan_csv( ignore_errors Try to keep reading lines if some lines yield errors. First try `infer_schema_length=0` to read all columns as - `pl.Utf8` to check which values might cause an issue. + `pl.String` to check which values might cause an issue. cache Cache the result after reading. with_column_names @@ -786,7 +786,7 @@ def scan_csv( this function will receive (and should return) a list of column names. infer_schema_length Maximum number of lines to read to infer schema. - If set to 0, all columns will be read as `pl.Utf8`. + If set to 0, all columns will be read as `pl.String`. If set to `None`, a full table scan will be done (slow). n_rows Stop reading from CSV file after reading `n_rows`. @@ -807,7 +807,7 @@ def scan_csv( try_parse_dates Try to automatically parse dates. Most ISO8601-like formats can be inferred, as well as a handful of others. If this does not succeed, - the column remains of data type `pl.Utf8`. + the column remains of data type `pl.String`. eol_char Single byte end of line character (default: `\n`). When encountering a file with windows line endings (`\r\n`), one can go with the default `\n`. The extra @@ -874,7 +874,7 @@ def scan_csv( >>> pl.scan_csv( ... path, ... new_columns=["idx", "txt"], - ... dtypes=[pl.UInt16, pl.Utf8], + ... dtypes=[pl.UInt16, pl.String], ... 
).collect() shape: (4, 2) ┌─────┬──────┐ diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index df413147d3f2..966d9519219e 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -38,12 +38,12 @@ Int16, Int32, Int64, + String, Time, UInt8, UInt16, UInt32, UInt64, - Utf8, py_type_to_dtype, ) from polars.dependencies import dataframe_api_compat, subprocess @@ -666,7 +666,7 @@ def dtypes(self) -> list[DataType]: ... } ... ) >>> lf.dtypes - [Int64, Float64, Utf8] + [Int64, Float64, String] """ return self._ldf.dtypes() @@ -686,7 +686,7 @@ def schema(self) -> OrderedDict[str, DataType]: ... } ... ) >>> lf.schema - OrderedDict({'foo': Int64, 'bar': Float64, 'ham': Utf8}) + OrderedDict({'foo': Int64, 'bar': Float64, 'ham': String}) """ return OrderedDict(self._ldf.schema()) @@ -2463,7 +2463,7 @@ def cast( Cast all frame columns to the specified dtype: - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + >>> lf.cast(pl.String).collect().to_dict(as_series=False) {'foo': ['1', '2', '3'], 'bar': ['6.0', '7.0', '8.0'], 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} @@ -2471,7 +2471,7 @@ def cast( Use selectors to define the columns being cast: >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() shape: (3, 3) ┌─────┬─────┬────────────┐ │ foo ┆ bar ┆ ham │ @@ -4813,7 +4813,7 @@ def infer_dtype(value: Any) -> PolarsDataType: elif isinstance(value, time): dtypes = [Time] elif isinstance(value, str): - dtypes = [Utf8, Categorical] + dtypes = [String, Categorical] else: # fallback; anything not explicitly handled above dtypes = [infer_dtype(F.lit(value))] @@ -5144,7 +5144,7 @@ def explode( ---------- columns Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. 
+ columns being exploded must be of List or String datatype. *more_columns Additional names of columns to explode, specified as positional arguments. diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index abc0246be970..1de49f6bd59b 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -20,8 +20,8 @@ Decimal, Duration, Object, + String, Time, - Utf8, is_polars_dtype, ) from polars.expr import Expr @@ -334,7 +334,7 @@ def all() -> SelectorType: Select all columns, casting them to string: - >>> df.select(cs.all().cast(pl.Utf8)) + >>> df.select(cs.all().cast(pl.String)) shape: (2, 2) ┌────────────┬─────────┐ │ dt ┆ value │ @@ -1796,7 +1796,7 @@ def starts_with(*prefix: str) -> SelectorType: @deprecate_nonkeyword_arguments(version="0.19.3") def string(include_categorical: bool = False) -> SelectorType: # noqa: FBT001 """ - Select all Utf8 (and, optionally, Categorical) string columns . + Select all String (and, optionally, Categorical) string columns . See Also -------- @@ -1848,7 +1848,7 @@ def string(include_categorical: bool = False) -> SelectorType: # noqa: FBT001 └─────┴─────┴─────┴──────┘ """ - string_dtypes: list[PolarsDataType] = [Utf8] + string_dtypes: list[PolarsDataType] = [String] if include_categorical: string_dtypes.append(Categorical) diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 19d588298023..59c4f0537317 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -113,9 +113,9 @@ def mean(self) -> dt.date | dt.datetime | None: def to_string(self, format: str) -> Series: """ - Convert a Date/Time/Datetime column into a Utf8 column with the given format. + Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.Utf8)`, but this method allows you to customize the + Similar to `cast(pl.String)`, but this method allows you to customize the formatting of the resulting string. 
Parameters @@ -145,9 +145,9 @@ def to_string(self, format: str) -> Series: def strftime(self, format: str) -> Series: """ - Convert a Date/Time/Datetime column into a Utf8 column with the given format. + Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.Utf8)`, but this method allows you to customize the + Similar to `cast(pl.String)`, but this method allows you to customize the formatting of the resulting string. Alias for :func:`to_string`. diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 136bc7cf64e9..22808628e0d0 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -285,7 +285,7 @@ def join(self, separator: IntoExpr) -> Series: """ Join all string items in a sublist and place a separator between them. - This errors if inner type of list `!= Utf8`. + This errors if inner type of list `!= String`. Parameters ---------- @@ -295,7 +295,7 @@ def join(self, separator: IntoExpr) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. + Series of data type :class:`String`. Examples -------- diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 78fceb312ebd..1df6f102f014 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -39,11 +39,11 @@ List, Null, Object, + String, Time, UInt32, UInt64, Unknown, - Utf8, dtype_to_ctype, is_polars_dtype, maybe_cast, @@ -1149,7 +1149,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]: Ensures that `np.asarray(pl.Series(..))` works as expected, see https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. 
""" - if not dtype and self.dtype == Utf8 and not self.null_count(): + if not dtype and self.dtype == String and not self.null_count(): dtype = np.dtype("U") if dtype: return self.to_numpy().__array__(dtype) @@ -1711,7 +1711,7 @@ def describe( "null_count": self.null_count(), "sum": self.sum(), } - elif self.dtype == Utf8: + elif self.dtype == String: stats_dtype = Int64 stats = { "count": self.count(), @@ -1721,7 +1721,7 @@ def describe( elif self.dtype.is_temporal(): # we coerce all to string, because a polars column # only has a single dtype and dates: datetime and count: int don't match - stats_dtype = Utf8 + stats_dtype = String stats = { "count": str(self.count()), "null_count": str(self.null_count()), @@ -1734,7 +1734,7 @@ def describe( return pl.DataFrame( {"statistic": stats.keys(), "value": stats.values()}, - schema={"statistic": Utf8, "value": stats_dtype}, + schema={"statistic": String, "value": stats_dtype}, ) def sum(self) -> int | float: @@ -7153,13 +7153,13 @@ def is_boolean(self) -> bool: """ return self.dtype == Boolean - @deprecate_function("Use `Series.dtype == pl.Utf8` instead.", version="0.19.14") + @deprecate_function("Use `Series.dtype == pl.String` instead.", version="0.19.14") def is_utf8(self) -> bool: """ - Check if this Series datatype is a Utf8. + Check if this Series datatype is a String. .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. + Use `Series.dtype == pl.String` instead. 
Examples -------- @@ -7168,7 +7168,7 @@ def is_utf8(self) -> bool: True """ - return self.dtype == Utf8 + return self.dtype == String @deprecate_renamed_function("gather_every", version="0.19.14") def take_every(self, n: int, offset: int = 0) -> Series: diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index eaae9fd16d03..473bf3cb172f 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -40,7 +40,7 @@ def to_date( cache: bool = True, ) -> Series: """ - Convert a Utf8 column into a Date column. + Convert a String column into a Date column. Parameters ---------- @@ -89,7 +89,7 @@ def to_datetime( ambiguous: Ambiguous | Series = "raise", ) -> Series: """ - Convert a Utf8 column into a Datetime column. + Convert a String column into a Datetime column. Parameters ---------- @@ -161,7 +161,7 @@ def to_time( cache: bool = True, ) -> Series: """ - Convert a Utf8 column into a Time column. + Convert a String column into a Time column. Parameters ---------- @@ -201,7 +201,7 @@ def strptime( ambiguous: Ambiguous | Series = "raise", ) -> Series: """ - Convert a Utf8 column into a Date/Datetime/Time column. + Convert a String column into a Date/Datetime/Time column. Parameters ---------- @@ -292,7 +292,7 @@ def to_decimal( inference_length: int = 100, ) -> Series: """ - Convert a Utf8 column into a Decimal column. + Convert a String column into a Decimal column. This method infers the needed parameters `precision` and `scale`. @@ -407,7 +407,7 @@ def concat(self, delimiter: str = "-", *, ignore_nulls: bool = True) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. + Series of data type :class:`String`. Examples -------- @@ -575,7 +575,7 @@ def encode(self, encoding: TransferEncoding) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. + Series of data type :class:`String`. 
Examples -------- @@ -632,7 +632,7 @@ def json_path_match(self, json_path: str) -> Series: Extract the first match of json string with provided JSONPath expression. Throw errors if encounter invalid json strings. - All return value will be casted to Utf8 regardless of the original value. + All return values will be cast to String regardless of the original value. Documentation on JSONPath standard can be found `here `_. @@ -645,7 +645,7 @@ def json_path_match(self, json_path: str) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. Contains null values if the original + Series of data type :class:`String`. Contains null values if the original value is null or the json_path returns nothing. Examples @@ -683,7 +683,7 @@ def extract(self, pattern: str, group_index: int = 1) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. Contains null values if the original + Series of data type :class:`String`. Contains null values if the original value is null or regex captures nothing. Notes @@ -784,7 +784,7 @@ def extract_all(self, pattern: str | Series) -> Series: Returns ------- Series - Series of data type `List(Utf8)`. + Series of data type `List(String)`. Examples -------- @@ -834,7 +834,8 @@ def extract_groups(self, pattern: str) -> Series: Returns ------- Series - Series of data type :class:`Struct` with fields of data type :class:`Utf8`. + Series of data type :class:`Struct` with fields of data type + :class:`String`. Examples -------- @@ -917,7 +918,7 @@ def split(self, by: IntoExpr, *, inclusive: bool = False) -> Series: Returns ------- Series - Series of data type `List(Utf8)`. + Series of data type `List(String)`. """ @@ -977,7 +978,8 @@ def split_exact(self, by: IntoExpr, n: int, *, inclusive: bool = False) -> Serie Returns ------- Series - Series of data type :class:`Struct` with fields of data type :class:`Utf8`. + Series of data type :class:`Struct` with fields of data type + :class:`String`. 
""" @@ -1035,7 +1037,8 @@ def splitn(self, by: IntoExpr, n: int) -> Series: Returns ------- Series - Series of data type :class:`Struct` with fields of data type :class:`Utf8`. + Series of data type :class:`Struct` with fields of data type + :class:`String`. """ @@ -1378,7 +1381,7 @@ def zfill(self, length: int) -> Series: Examples -------- >>> s = pl.Series([-1, 123, 999999, None]) - >>> s.cast(pl.Utf8).str.zfill(4) + >>> s.cast(pl.String).str.zfill(4) shape: (4,) Series: '' [str] [ @@ -1460,7 +1463,7 @@ def reverse(self) -> Series: def slice(self, offset: int, length: int | None = None) -> Series: """ - Create subslices of the string values of a Utf8 Series. + Create subslices of the string values of a String Series. Parameters ---------- @@ -1473,7 +1476,8 @@ def slice(self, offset: int, length: int | None = None) -> Series: Returns ------- Series - Series of data type :class:`Struct` with fields of data type :class:`Utf8`. + Series of data type :class:`Struct` with fields of data type + :class:`String`. Examples -------- @@ -1509,7 +1513,7 @@ def explode(self) -> Series: Returns ------- Series - Series of data type :class:`Utf8`. + Series of data type :class:`String`. Examples -------- @@ -1530,7 +1534,7 @@ def explode(self) -> Series: def to_integer(self, *, base: int = 10, strict: bool = True) -> Series: """ - Convert an Utf8 column into an Int64 column with base radix. + Convert an String column into an Int64 column with base radix. 
Parameters ---------- diff --git a/py-polars/polars/testing/asserts/series.py b/py-polars/polars/testing/asserts/series.py index 84fcccf14c39..ae8cb8d672a4 100644 --- a/py-polars/polars/testing/asserts/series.py +++ b/py-polars/polars/testing/asserts/series.py @@ -9,8 +9,8 @@ Decimal, Float64, List, + String, Struct, - Utf8, unpack_dtypes, ) from polars.exceptions import ComputeError @@ -126,9 +126,9 @@ def _assert_series_values_equal( # Handle categoricals if categorical_as_str: if left.dtype == Categorical: - left = left.cast(Utf8) + left = left.cast(String) if right.dtype == Categorical: - right = right.cast(Utf8) + right = right.cast(String) # Handle decimals # TODO: Delete this branch when Decimal equality is implemented diff --git a/py-polars/polars/testing/parametric/primitives.py b/py-polars/polars/testing/parametric/primitives.py index 6c36d46a5b81..ad9c94bc8365 100644 --- a/py-polars/polars/testing/parametric/primitives.py +++ b/py-polars/polars/testing/parametric/primitives.py @@ -98,7 +98,7 @@ class column: >>> column(name="unique_small_ints", dtype=pl.UInt8, unique=True) column(name='unique_small_ints', dtype=UInt8, strategy=None, null_probability=None, unique=True) >>> column(name="ccy", strategy=sampled_from(["GBP", "EUR", "JPY"])) - column(name='ccy', dtype=Utf8, strategy=sampled_from(['GBP', 'EUR', 'JPY']), null_probability=None, unique=False) + column(name='ccy', dtype=String, strategy=sampled_from(['GBP', 'EUR', 'JPY']), null_probability=None, unique=False) """ # noqa: W505 @@ -332,7 +332,7 @@ def series( >>> from polars.testing.parametric import create_list_strategy >>> s = series( ... strategy=create_list_strategy( - ... inner_dtype=pl.Utf8, + ... inner_dtype=pl.String, ... select_from=["xx", "yy", "zz"], ... ), ... 
min_size=2, diff --git a/py-polars/polars/testing/parametric/strategies.py b/py-polars/polars/testing/parametric/strategies.py index 164fffd9edd1..71f66dd7c21f 100644 --- a/py-polars/polars/testing/parametric/strategies.py +++ b/py-polars/polars/testing/parametric/strategies.py @@ -50,12 +50,12 @@ Int32, Int64, List, + String, Time, UInt8, UInt16, UInt32, UInt64, - Utf8, is_polars_dtype, ) from polars.type_aliases import PolarsDataType @@ -94,7 +94,7 @@ def between(draw: DrawFn, type_: type, min_: Any, max_: Any) -> Any: strategy_u64 = integers(min_value=0, max_value=(2**64) - 1) strategy_categorical = text(max_size=2, alphabet=ascii_uppercase) -strategy_utf8 = text( +strategy_string = text( alphabet=characters(max_codepoint=1000, exclude_categories=["Cs", "Cc"]), max_size=8, ) @@ -273,7 +273,7 @@ def update(self, items: StrategyLookup) -> Self: # type: ignore[override] Duration("ms"): strategy_duration, Duration: strategy_duration, Categorical: strategy_categorical, - Utf8: strategy_utf8, + String: strategy_string, Binary: strategy_binary, } ) @@ -295,7 +295,7 @@ def _get_strategy_dtypes( Parameters ---------- base_type - If True, return the base types for each dtype (eg:`List(Utf8)` → `List`). + If True, return the base types for each dtype (eg:`List(String)` → `List`). excluding A dtype or sequence of dtypes to omit from the results. @@ -359,7 +359,7 @@ def create_list_strategy( Create a strategy that generates lists of lists of specific strings: >>> lst = create_list_strategy( - ... inner_dtype=pl.List(pl.Utf8), + ... inner_dtype=pl.List(pl.String), ... select_from=["xx", "yy", "zz"], ... 
) >>> lst.example() # doctest: +SKIP diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index e3e31590f5a8..8cd2a214d6f4 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -36,11 +36,11 @@ List, Null, Object, + String, Struct, Time, UInt32, Unknown, - Utf8, dtype_to_py_type, is_polars_dtype, numpy_char_code_to_dtype, @@ -1091,7 +1091,7 @@ def _sequence_of_sequence_to_pydf( unpack_nested = False for col, tp in local_schema_override.items(): if tp in (Categorical, Enum): - local_schema_override[col] = Utf8 + local_schema_override[col] = String elif not unpack_nested and (tp.base_type() in (Unknown, Struct)): unpack_nested = contains_nested( getattr(first_element, col, None).__class__, is_namedtuple @@ -1283,7 +1283,7 @@ def _establish_dataclass_or_model_schema( for col, tp in overrides.items(): if tp in (Categorical, Enum): - overrides[col] = Utf8 + overrides[col] = String elif not unpack_nested and (tp.base_type() in (Unknown, Struct)): unpack_nested = contains_nested( getattr(first_element, col, None), diff --git a/py-polars/polars/utils/udfs.py b/py-polars/polars/utils/udfs.py index de3c017b6663..6ca19fdf618d 100644 --- a/py-polars/polars/utils/udfs.py +++ b/py-polars/polars/utils/udfs.py @@ -133,7 +133,7 @@ class OpNames: ) # python functions that we can map to native expressions -_PYTHON_CASTS_MAP = {"float": "Float64", "int": "Int64", "str": "Utf8"} +_PYTHON_CASTS_MAP = {"float": "Float64", "int": "Int64", "str": "String"} _PYTHON_BUILTINS = frozenset(_PYTHON_CASTS_MAP) | {"abs"} _PYTHON_METHODS_MAP = { "lower": "str.to_lowercase", diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index bb33b078c7f2..678c85bdc64a 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -21,8 +21,8 @@ Decimal, Duration, Int64, + String, Time, - Utf8, ) from polars.dependencies import _check_for_numpy from 
polars.dependencies import numpy as np @@ -116,7 +116,7 @@ def is_str_sequence( elif _check_for_numpy(val) and isinstance(val, np.ndarray): return np.issubdtype(val.dtype, np.str_) elif include_series and isinstance(val, pl.Series): - return val.dtype == pl.Utf8 + return val.dtype == pl.String return isinstance(val, Sequence) and _is_iterable_of(val, str) @@ -276,9 +276,9 @@ def _cast_repr_strings_with_schema( tp: PolarsDataType | None if not df.is_empty(): for tp in df.schema.values(): - if tp != Utf8: + if tp != String: raise TypeError( - f"DataFrame should contain only Utf8 string repr data; found {tp!r}" + f"DataFrame should contain only String repr data; found {tp!r}" ) # duration string scaling @@ -367,7 +367,7 @@ def str_duration_(td: str | None) -> int | None: separator=".", ) ) - .cast(Utf8) + .cast(String) .cast(tp) ) elif tp != df.schema[c]: diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index 10ce95c8db24..87267ff3e40e 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -347,7 +347,7 @@ impl ToPyObject for Wrap { class.call0().unwrap().into() }, DataType::String => { - let class = pl.getattr(intern!(py, "Utf8")).unwrap(); + let class = pl.getattr(intern!(py, "String")).unwrap(); class.call0().unwrap().into() }, DataType::Binary => { @@ -452,7 +452,7 @@ impl FromPyObject<'_> for Wrap { "Int16" => DataType::Int16, "Int32" => DataType::Int32, "Int64" => DataType::Int64, - "Utf8" => DataType::String, + "String" => DataType::String, "Binary" => DataType::Binary, "Boolean" => DataType::Boolean, "Categorical" => DataType::Categorical(None, Default::default()), @@ -490,7 +490,7 @@ impl FromPyObject<'_> for Wrap { "UInt16" => DataType::UInt16, "UInt32" => DataType::UInt32, "UInt64" => DataType::UInt64, - "Utf8" => DataType::String, + "String" => DataType::String, "Binary" => DataType::Binary, "Boolean" => DataType::Boolean, "Categorical" => { diff --git a/py-polars/tests/parametric/test_dataframe.py 
b/py-polars/tests/parametric/test_dataframe.py index 13dbfb228bb7..1895e7146530 100644 --- a/py-polars/tests/parametric/test_dataframe.py +++ b/py-polars/tests/parametric/test_dataframe.py @@ -41,7 +41,7 @@ def test_dtype_integer_cols(df: pl.DataFrame) -> None: min_size=1, min_cols=1, null_probability=0.25, - excluded_dtypes=[pl.Utf8, pl.List], + excluded_dtypes=[pl.String, pl.List], ) ) @example(df=pl.DataFrame(schema=["x", "y", "z"])) diff --git a/py-polars/tests/parametric/test_testing.py b/py-polars/tests/parametric/test_testing.py index 5d036000fa3d..6b47c43f35c4 100644 --- a/py-polars/tests/parametric/test_testing.py +++ b/py-polars/tests/parametric/test_testing.py @@ -78,7 +78,7 @@ def test_strategy_shape( ) ) def test_strategy_frame_columns(lf: pl.LazyFrame) -> None: - assert lf.schema == {"a": pl.UInt8, "b": pl.UInt8, "c": pl.Boolean, "d": pl.Utf8} + assert lf.schema == {"a": pl.UInt8, "b": pl.UInt8, "c": pl.Boolean, "d": pl.String} assert lf.columns == ["a", "b", "c", "d"] df = lf.collect() @@ -215,7 +215,7 @@ def finite_float(value: Any) -> bool: column( name="colz", strategy=create_list_strategy( - inner_dtype=pl.List(pl.Utf8), + inner_dtype=pl.List(pl.String), select_from=["aa", "bb", "cc"], min_size=1, ), @@ -227,7 +227,7 @@ def test_list_strategy(df: pl.DataFrame) -> None: assert df.schema == { "colx": pl.List(pl.UInt8), "coly": pl.List(pl.Datetime("ms")), - "colz": pl.List(pl.List(pl.Utf8)), + "colz": pl.List(pl.List(pl.String)), } uint8_max = (2**8) - 1 diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py index a59f3343cc8a..95c458ad7387 100644 --- a/py-polars/tests/unit/dataframe/test_describe.py +++ b/py-polars/tests/unit/dataframe/test_describe.py @@ -80,7 +80,7 @@ def test_df_describe_nested() -> None: ("max", None, None), ], schema=["describe"] + df.columns, - schema_overrides={"struct": pl.Utf8, "list": pl.Utf8}, + schema_overrides={"struct": pl.String, "list": pl.String}, ) 
assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 2d85af8d1354..8baa99b37ac1 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -284,9 +284,9 @@ def test_from_arrow(monkeypatch: Any) -> None: assert df1.rows() == df.rows() assert df2.rows() == df.rows()[:3] - assert df0.schema == {"id": pl.Utf8, "points": pl.Int64} - assert df1.schema == {"x": pl.Utf8, "y": pl.Int32} - assert df2.schema == {"x": pl.Utf8, "y": pl.Int32} + assert df0.schema == {"id": pl.String, "points": pl.Int64} + assert df1.schema == {"x": pl.String, "y": pl.Int32} + assert df2.schema == {"x": pl.String, "y": pl.Int32} with pytest.raises(TypeError, match="Cannot convert str"): pl.from_arrow(data="xyz") @@ -887,11 +887,11 @@ def test_cast_frame() -> None: # cast via col:dtype map assert df.cast( - dtypes={"b": pl.Float32, "c": pl.Utf8, "d": pl.Datetime("ms")} + dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")} ).schema == { "a": pl.Float64, "b": pl.Float32, - "c": pl.Utf8, + "c": pl.String, "d": pl.Datetime("ms"), } @@ -899,7 +899,7 @@ def test_cast_frame() -> None: assert df.cast( { cs.numeric(): pl.UInt8, - cs.temporal(): pl.Utf8, + cs.temporal(): pl.String, } ).rows() == [ (1, 4, True, "2020-01-02"), @@ -908,7 +908,7 @@ def test_cast_frame() -> None: ] # cast all fields to a single type - assert df.cast(pl.Utf8).to_dict(as_series=False) == { + assert df.cast(pl.String).to_dict(as_series=False) == { "a": ["1.0", "2.5", "3.0"], "b": ["4", "5", None], "c": ["true", "false", "true"], @@ -1008,7 +1008,7 @@ def test_to_numpy_structured() -> None: ) df = pl.from_numpy(structured_array) assert df.schema == { - "product": pl.Utf8, + "product": pl.String, "price_usd": pl.Float64, "in_stock": pl.Boolean, } @@ -1089,7 +1089,7 @@ def test_literal_series() -> None: expected_schema = { "a": pl.Float64, "b": pl.Int8, - "c": pl.Utf8, + "c": pl.String, 
"d": pl.Datetime("ns"), "e": pl.Float32, } @@ -1194,7 +1194,7 @@ def __iter__(self) -> Iterator[Any]: ): assert_frame_equal(expected, generated_frame) assert generated_frame.schema == { - "a": pl.Utf8, + "a": pl.String, "b": pl.Int64, "c": pl.Int64, "d": pl.Int64, @@ -1204,7 +1204,12 @@ def __iter__(self) -> Iterator[Any]: cols = ["a", "b", ("c", pl.Int8), "d"] expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)] - expected_schema = [("a", pl.Utf8), ("b", pl.Int64), ("c", pl.Int8), ("d", pl.Int64)] + expected_schema = [ + ("a", pl.String), + ("b", pl.Int64), + ("c", pl.Int8), + ("d", pl.Int64), + ] for params in ( {"data": Rows(4)}, @@ -1226,7 +1231,7 @@ def __iter__(self) -> Iterator[Any]: data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]), infer_schema_length=1001, ) - assert df.schema == {"col": pl.List(pl.Utf8)} + assert df.schema == {"col": pl.List(pl.String)} assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]] # empty iterator @@ -1301,7 +1306,7 @@ def test_from_rows_of_dicts() -> None: } df2 = df_init(records, schema_overrides=overrides) assert df2.rows() == [(1, 100, "a"), (2, 101, "b")] - assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.Utf8} + assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String} df3 = df_init(records, schema=overrides) assert df3.rows() == [(1, 100), (2, 101)] @@ -1658,9 +1663,9 @@ def test_group_by_agg_n_unique_floats() -> None: def test_select_by_dtype(df: pl.DataFrame) -> None: - out = df.select(pl.col(pl.Utf8)) + out = df.select(pl.col(pl.String)) assert out.columns == ["strings", "strings_nulls"] - out = df.select(pl.col([pl.Utf8, pl.Boolean])) + out = df.select(pl.col([pl.String, pl.Boolean])) assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"] out = df.select(pl.col(INTEGER_DTYPES)) assert out.columns == ["int", "int_nulls"] @@ -1759,7 +1764,7 @@ def test_schema() -> None: df = pl.DataFrame( {"foo": [1, 2, 3], "bar": 
[6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} ) - expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.Utf8} + expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String} assert df.schema == expected @@ -1894,8 +1899,8 @@ def test_fill_null() -> None: df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]}) ) - # utf8 and list data - # utf8 goes via binary + # string and list data + # string goes via binary df = pl.DataFrame( { "c": [ @@ -2396,7 +2401,7 @@ def test_empty_is_in() -> None: ) assert df_empty_isin.shape == (0, 1) assert df_empty_isin.rows() == [] - assert df_empty_isin.schema == {"foo": pl.Utf8} + assert df_empty_isin.schema == {"foo": pl.String} def test_group_by_slice_expression_args() -> None: @@ -2615,7 +2620,7 @@ def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None: def test_selection_misc() -> None: - df = pl.DataFrame({"x": "abc"}, schema={"x": pl.Utf8}) + df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String}) # literal values (as scalar/list) for zero in (0, [0]): @@ -3190,7 +3195,7 @@ def test_format_empty_df() -> None: ] ) assert df.shape == (0, 1) - assert df.dtypes == [pl.Utf8] + assert df.dtypes == [pl.String] def test_deadlocks_3409() -> None: diff --git a/py-polars/tests/unit/dataframe/test_from_dict.py b/py-polars/tests/unit/dataframe/test_from_dict.py index 10d18c567762..324829087086 100644 --- a/py-polars/tests/unit/dataframe/test_from_dict.py +++ b/py-polars/tests/unit/dataframe/test_from_dict.py @@ -71,7 +71,7 @@ def test_from_dict_with_scalars() -> None: "value": {0: "x", 1: "y", 2: "z"}.values(), }, schema={ - "value": pl.Utf8, + "value": pl.String, "other": pl.Float32, "misc": pl.Int32, "key": pl.Int8, @@ -85,7 +85,7 @@ def test_from_dict_with_scalars() -> None: "key": [1, 2, 3], } assert df4.schema == { - "value": pl.Utf8, + "value": pl.String, "other": pl.Float32, "misc": pl.Int32, "key": pl.Int8, @@ -106,7 +106,7 @@ def test_from_dict_with_scalars() -> None: assert df5.schema == { "x": 
pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]), "y": pl.Int8, - "z": pl.Utf8, + "z": pl.String, } # mixed with numpy cols... @@ -182,7 +182,7 @@ def test_from_dict_with_scalars_mixed() -> None: "h": pl.Date, "i": pl.Duration, "j": pl.Datetime, - "k": pl.Utf8, + "k": pl.String, }, ) dfx = df8.select(pl.exclude("idx")) diff --git a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index aa886aff003d..ebdd6c2ce4a8 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -58,7 +58,7 @@ def test_array_construction() -> None: {"row_id": "a", "data": [1, 2, 3]}, {"row_id": "b", "data": [2, 3, 4]}, ] - schema = {"row_id": pl.Utf8(), "data": pl.Array(inner=pl.Int64, width=3)} + schema = {"row_id": pl.String(), "data": pl.Array(inner=pl.Int64, width=3)} df = pl.from_dicts(rows, schema=schema) assert df.schema == schema assert df.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])] @@ -159,7 +159,7 @@ def test_array_data_type_equality() -> None: assert pl.Array(pl.Int64, 2) == pl.Array assert pl.Array(pl.Int64, 2) == pl.Array(pl.Int64, 2) assert pl.Array(pl.Int64, 2) != pl.Array(pl.Int64, 3) - assert pl.Array(pl.Int64, 2) != pl.Array(pl.Utf8, 2) + assert pl.Array(pl.Int64, 2) != pl.Array(pl.String, 2) assert pl.Array(pl.Int64, 2) != pl.List(pl.Int64) @@ -169,7 +169,7 @@ def test_array_data_type_equality() -> None: ([[1, 2], None, [3, None], [None, None]], pl.Int64), ([[True, False], None, [True, None], [None, None]], pl.Boolean), ([[1.0, 2.0], None, [3.0, None], [None, None]], pl.Float32), - ([["a", "b"], None, ["c", None], [None, None]], pl.Utf8), + ([["a", "b"], None, ["c", None], [None, None]], pl.String), ([[[1, 2], None], None, [[3], None], [None, None]], pl.List(pl.Int32)), ], ) diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index cfb569be0ea3..a0469c732dc4 100644 --- 
a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -66,8 +66,8 @@ def test_categorical_outer_join() -> None: df = dfa.join(dfb, on="key", how="outer") # the cast is important to test the rev map - assert df["key"].cast(pl.Utf8).to_list() == ["bar", None, "foo"] - assert df["key_right"].cast(pl.Utf8).to_list() == ["bar", "baz", None] + assert df["key"].cast(pl.String).to_list() == ["bar", None, "foo"] + assert df["key_right"].cast(pl.String).to_list() == ["bar", "baz", None] def test_read_csv_categorical() -> None: @@ -141,7 +141,7 @@ def test_categorical_equality( s = pl.Series(["a", "b", "c", "c", None, None], dtype=pl.Categorical) s2 = pl.Series("b_cat", ["a", "b", "c", "a", "b", "c"], dtype=pl.Categorical) assert_series_equal(op(s, s2), expected) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected) + assert_series_equal(op(s, s2.cast(pl.String)), expected) @pytest.mark.parametrize( @@ -160,7 +160,7 @@ def test_categorical_equality_global_fastpath( s = pl.Series(["a", "b", "c", "c", None, None], dtype=pl.Categorical) s2 = pl.Series(["d"], dtype=pl.Categorical) assert_series_equal(op(s, s2), expected) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected) + assert_series_equal(op(s, s2.cast(pl.String)), expected) @pytest.mark.parametrize( @@ -229,7 +229,7 @@ def test_categorical_global_ordering_broadcast_rhs( s = s.cast(pl.Categorical("lexical")) s2 = s2.cast(pl.Categorical("lexical")) assert_series_equal(op(s, s2), expected_lexical) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected_lexical) + assert_series_equal(op(s, s2.cast(pl.String)), expected_lexical) @pytest.mark.parametrize( @@ -258,7 +258,7 @@ def test_categorical_global_ordering_broadcast_lhs( s = s.cast(pl.Categorical("lexical")) s2 = s2.cast(pl.Categorical("lexical")) assert_series_equal(op(s, s2), expected_lexical) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected_lexical) + assert_series_equal(op(s, s2.cast(pl.String)), 
expected_lexical) @pytest.mark.parametrize( @@ -340,7 +340,7 @@ def test_compare_categorical_single_non_existent( assert_series_equal(op(s, s2), expected) # type: ignore[arg-type] s_cat = pl.Series(["d"], dtype=pl.Categorical) assert_series_equal(op(s, s_cat), expected) - assert_series_equal(op(s, s_cat.cast(pl.Utf8)), expected) + assert_series_equal(op(s, s_cat.cast(pl.String)), expected) @pytest.mark.parametrize( @@ -381,7 +381,7 @@ def test_compare_categorical_single_none( s = pl.Series([None, "a", "b", "c", "b", "a"], dtype=pl.Categorical) s2 = pl.Series([None], dtype=pl.Categorical) assert_series_equal(op(s, s2), expected) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected) + assert_series_equal(op(s, s2.cast(pl.String)), expected) def test_categorical_error_on_local_cmp() -> None: @@ -484,7 +484,7 @@ def test_stringcache() -> None: with pl.StringCache(): # create a large enough column that the categorical map is reallocated df = pl.DataFrame({"cats": pl.arange(0, N, eager=True)}).select( - [pl.col("cats").cast(pl.Utf8).cast(pl.Categorical)] + [pl.col("cats").cast(pl.String).cast(pl.Categorical)] ) assert df.filter(pl.col("cats").is_in(["1", "2"])).to_dict(as_series=False) == { "cats": ["1", "2"] @@ -668,7 +668,7 @@ def test_list_builder_different_categorical_rev_maps() -> None: def test_categorical_collect_11408() -> None: df = pl.DataFrame( data={"groups": ["a", "b", "c"], "cats": ["a", "b", "c"], "amount": [1, 2, 3]}, - schema={"groups": pl.Utf8, "cats": pl.Categorical, "amount": pl.Int8}, + schema={"groups": pl.String, "cats": pl.Categorical, "amount": pl.Int8}, ) assert df.group_by("groups").agg( diff --git a/py-polars/tests/unit/datatypes/test_decimal.py b/py-polars/tests/unit/datatypes/test_decimal.py index ca94fcfacf01..07c31313a48b 100644 --- a/py-polars/tests/unit/datatypes/test_decimal.py +++ b/py-polars/tests/unit/datatypes/test_decimal.py @@ -145,7 +145,7 @@ def test_decimal_scale_precision_roundtrip(monkeypatch: Any) -> None: assert 
pl.from_arrow(pl.Series("dec", [D("10.0")]).to_arrow()).item() == D("10.0") -def test_utf8_to_decimal() -> None: +def test_string_to_decimal() -> None: s = pl.Series( [ "40.12", @@ -180,7 +180,7 @@ def test_read_csv_decimal(monkeypatch: Any) -> None: 0.01,a""" df = pl.read_csv(csv.encode(), dtypes={"a": pl.Decimal(scale=2)}) - assert df.dtypes == [pl.Decimal(precision=None, scale=2), pl.Utf8] + assert df.dtypes == [pl.Decimal(precision=None, scale=2), pl.String] assert df["a"].to_list() == [ D("123.12"), D("1.10"), diff --git a/py-polars/tests/unit/datatypes/test_enum.py b/py-polars/tests/unit/datatypes/test_enum.py index 78a897fc54e8..52b9f4a10761 100644 --- a/py-polars/tests/unit/datatypes/test_enum.py +++ b/py-polars/tests/unit/datatypes/test_enum.py @@ -178,7 +178,7 @@ def test_equality_enum( s2 = pl.Series([None, "c", "b", "c"], dtype=dtype) assert_series_equal(op(s, s2), expected) - assert_series_equal(op(s, s2.cast(pl.Utf8)), expected) + assert_series_equal(op(s, s2.cast(pl.String)), expected) @pytest.mark.parametrize( @@ -215,10 +215,10 @@ def test_equality_missing_enum_scalar() -> None: expected = pl.Series("cmp", [False, False, False, True], dtype=pl.Boolean) assert_series_equal(out, expected) - out_utf8 = df.select(pl.col("a").eq_missing(pl.lit("c")).alias("cmp")).get_column( + out_str = df.select(pl.col("a").eq_missing(pl.lit("c")).alias("cmp")).get_column( "cmp" ) - assert_series_equal(out_utf8, expected) + assert_series_equal(out_str, expected) out = df.select( pl.col("a").ne_missing(pl.lit("c", dtype=dtype)).alias("cmp") @@ -226,10 +226,10 @@ def test_equality_missing_enum_scalar() -> None: expected = pl.Series("cmp", [True, True, True, False], dtype=pl.Boolean) assert_series_equal(out, expected) - out_utf8 = df.select(pl.col("a").ne_missing(pl.lit("c")).alias("cmp")).get_column( + out_str = df.select(pl.col("a").ne_missing(pl.lit("c")).alias("cmp")).get_column( "cmp" ) - assert_series_equal(out_utf8, expected) + assert_series_equal(out_str, 
expected) def test_equality_missing_enum_none_scalar() -> None: diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 80536f3f3dd9..c498b6861b59 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -182,7 +182,7 @@ def test_list_diagonal_concat() -> None: def test_inner_type_categorical_on_rechunk() -> None: df = pl.DataFrame({"cats": ["foo", "bar"]}).select( - pl.col(pl.Utf8).cast(pl.Categorical).implode() + pl.col(pl.String).cast(pl.Categorical).implode() ) assert pl.concat([df, df], rechunk=True).dtypes == [pl.List(pl.Categorical)] @@ -267,11 +267,11 @@ def test_fast_explode_on_list_struct_6208() -> None: df = pl.DataFrame( data, schema={ - "label": pl.Utf8, - "tag": pl.Utf8, + "label": pl.String, + "tag": pl.String, "ref": pl.Int64, "parents": pl.List( - pl.Struct({"ref": pl.Int64, "tag": pl.Utf8, "ratio": pl.Float64}) + pl.Struct({"ref": pl.Int64, "tag": pl.String, "ratio": pl.Float64}) ), }, ) @@ -480,7 +480,7 @@ def test_logical_parallel_list_collect() -> None: .explode("Values") .unnest("Values") ) - assert out.dtypes == [pl.Utf8, pl.Categorical, pl.UInt32] + assert out.dtypes == [pl.String, pl.Categorical, pl.UInt32] assert out.to_dict(as_series=False) == { "Group": ["GroupA", "GroupA"], "Values": ["Value1", "Value2"], @@ -502,7 +502,7 @@ def test_list_recursive_categorical_cast() -> None: [ ([None, 1, 2], [None, [1], [2]], pl.Int64), ([None, 1.0, 2.0], [None, [1.0], [2.0]], pl.Float64), - ([None, "x", "y"], [None, ["x"], ["y"]], pl.Utf8), + ([None, "x", "y"], [None, ["x"], ["y"]], pl.String), ([None, True, False], [None, [True], [False]], pl.Boolean), ], ) @@ -604,7 +604,7 @@ def test_list_inner_cast_physical_11513() -> None: @pytest.mark.parametrize( - ("dtype", "expected"), [(pl.List, True), (pl.Struct, True), (pl.Utf8, False)] + ("dtype", "expected"), [(pl.List, True), (pl.Struct, True), (pl.String, False)] ) def 
test_datatype_is_nested(dtype: PolarsDataType, expected: bool) -> None: assert dtype.is_nested() is expected @@ -621,7 +621,7 @@ def test_list_series_construction_with_dtype_11849_11878() -> None: s = pl.Series( "groups", [[{"1": "A", "2": None}], [{"1": "B", "2": "C"}, {"1": "D", "2": "E"}]], - dtype=pl.List(pl.Struct([pl.Field("1", pl.Utf8), pl.Field("2", pl.Utf8)])), + dtype=pl.List(pl.Struct([pl.Field("1", pl.String), pl.Field("2", pl.String)])), ) assert s.to_list() == [ diff --git a/py-polars/tests/unit/datatypes/test_object.py b/py-polars/tests/unit/datatypes/test_object.py index 0c0342e2f4de..2c4cc070af6c 100644 --- a/py-polars/tests/unit/datatypes/test_object.py +++ b/py-polars/tests/unit/datatypes/test_object.py @@ -99,4 +99,4 @@ def test_object_row_construction() -> None: def test_object_apply_to_struct() -> None: s = pl.Series([0, 1, 2], dtype=pl.Object) out = s.map_elements(lambda x: {"a": str(x), "b": x}) - assert out.dtype == pl.Struct([pl.Field("a", pl.Utf8), pl.Field("b", pl.Int64)]) + assert out.dtype == pl.Struct([pl.Field("a", pl.String), pl.Field("b", pl.Int64)]) diff --git a/py-polars/tests/unit/datatypes/test_string.py b/py-polars/tests/unit/datatypes/test_string.py new file mode 100644 index 000000000000..ce63c4b6b79c --- /dev/null +++ b/py-polars/tests/unit/datatypes/test_string.py @@ -0,0 +1,30 @@ +import polars as pl +from polars.testing import assert_series_equal + + +def test_series_init_string() -> None: + s = pl.Series(["a", "b"]) + assert s.dtype == pl.String + + +def test_utf8_alias_eq() -> None: + assert pl.Utf8 == pl.String + assert pl.Utf8 == pl.String() + assert pl.Utf8() == pl.String + assert pl.Utf8() == pl.String() + + +def test_utf8_alias_hash() -> None: + assert hash(pl.Utf8) == hash(pl.String) + assert hash(pl.Utf8()) == hash(pl.String()) + + +def test_utf8_alias_series_init() -> None: + s = pl.Series(["a", "b"], dtype=pl.Utf8) + assert s.dtype == pl.String + + +def test_utf8_alias_lit() -> None: + result = 
pl.select(a=pl.lit(5, dtype=pl.Utf8)).to_series() + expected = pl.Series("a", ["5"], dtype=pl.String) + assert_series_equal(result, expected) diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 5c3eddc6d534..f92bf5f5d9e5 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -164,7 +164,7 @@ def test_struct_function_expansion() -> None: df = pl.DataFrame( {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]} ) - struct_schema = {"a": pl.UInt32, "b": pl.Utf8} + struct_schema = {"a": pl.UInt32, "b": pl.String} s = df.with_columns(pl.struct(pl.col(["a", "b"]), schema=struct_schema))["a"] assert isinstance(s, pl.Series) @@ -644,7 +644,7 @@ def test_empty_struct() -> None: [ pl.List, pl.List(pl.Null), - pl.List(pl.Utf8), + pl.List(pl.String), pl.Array(pl.Null, 32), pl.Array(pl.UInt8, 16), pl.Struct, @@ -699,7 +699,7 @@ def test_struct_null_cast() -> None: dtype = pl.Struct( [ pl.Field("a", pl.Int64), - pl.Field("b", pl.Utf8), + pl.Field("b", pl.String), pl.Field("c", pl.List(pl.Float64)), ] ) diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 6a750532bba4..14ae91f7ea66 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -938,7 +938,7 @@ def test_asof_join() -> None: ).set_sorted("dates") assert trades.schema == { "dates": pl.Datetime("ms"), - "ticker": pl.Utf8, + "ticker": pl.String, "bid": pl.Float64, } out = trades.join_asof(quotes, on="dates", strategy="backward") @@ -947,8 +947,8 @@ def test_asof_join() -> None: "bid": pl.Float64, "bid_right": pl.Float64, "dates": pl.Datetime("ms"), - "ticker": pl.Utf8, - "ticker_right": pl.Utf8, + "ticker": pl.String, + "ticker_right": pl.String, } assert out.columns == ["dates", "ticker", "bid", "ticker_right", "bid_right"] assert 
(out["dates"].cast(int)).to_list() == [ @@ -1556,7 +1556,11 @@ def test_strptime_with_tz() -> None: ], ) def test_strptime_empty(time_unit: TimeUnit, time_zone: str | None) -> None: - ts = pl.Series([None]).cast(pl.Utf8).str.strptime(pl.Datetime(time_unit, time_zone)) + ts = ( + pl.Series([None]) + .cast(pl.String) + .str.strptime(pl.Datetime(time_unit, time_zone)) + ) assert ts.dtype == pl.Datetime(time_unit, time_zone) @@ -2245,7 +2249,7 @@ def test_truncate_propagate_null() -> None: ) == {"date": [None, None, datetime(2022, 3, 20, 5, 7, 0)]} assert df.select( pl.col("date").dt.truncate( - every=pl.lit(None, dtype=pl.Utf8), + every=pl.lit(None, dtype=pl.String), ) ).to_dict(as_series=False) == {"date": [None, None, None]} diff --git a/py-polars/tests/unit/datatypes/test_time.py b/py-polars/tests/unit/datatypes/test_time.py index 67404efb1420..c7915a342db6 100644 --- a/py-polars/tests/unit/datatypes/test_time.py +++ b/py-polars/tests/unit/datatypes/test_time.py @@ -3,7 +3,7 @@ import polars as pl -def test_time_to_utf8_cast() -> None: +def test_time_to_string_cast() -> None: assert pl.Series([time(12, 1, 1)]).cast(str).to_list() == ["12:01:01"] diff --git a/py-polars/tests/unit/functions/as_datatype/test_as_datatype.py b/py-polars/tests/unit/functions/as_datatype/test_as_datatype.py index fdefc588e59d..d352dae78b00 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_as_datatype.py +++ b/py-polars/tests/unit/functions/as_datatype/test_as_datatype.py @@ -341,7 +341,7 @@ def test_struct_from_schema_only() -> None: s = df.select( pl.struct( schema={ - "str": pl.Utf8, + "str": pl.String, "u8": pl.UInt8, "i32": pl.Int32, "f64": pl.Float64, @@ -356,7 +356,7 @@ def test_struct_from_schema_only() -> None: # check dtypes assert s.dtype == pl.Struct( [ - pl.Field("str", pl.Utf8), + pl.Field("str", pl.String), pl.Field("u8", pl.UInt8), pl.Field("i32", pl.Int32), pl.Field("f64", pl.Float64), @@ -524,7 +524,7 @@ def test_concat_str_with_non_utf8_col() -> None: 
.select(pl.concat_str(["a", "b"], separator="-").fill_null(pl.col("a"))) .collect() ) - expected = pl.Series("a", ["0-x"], dtype=pl.Utf8) + expected = pl.Series("a", ["0-x"], dtype=pl.String) assert_series_equal(out.to_series(), expected) diff --git a/py-polars/tests/unit/functions/range/test_datetime_range.py b/py-polars/tests/unit/functions/range/test_datetime_range.py index 6160d7a01a66..23cee07c98a6 100644 --- a/py-polars/tests/unit/functions/range/test_datetime_range.py +++ b/py-polars/tests/unit/functions/range/test_datetime_range.py @@ -69,7 +69,7 @@ def test_datetime_range() -> None: assert len(result) == 61 assert result.dtype.time_unit == "ns" # type: ignore[attr-defined] assert result.dt.second()[-1] == 59 - assert result.cast(pl.Utf8)[-1] == "2022-01-01 00:00:59.247379260" + assert result.cast(pl.String)[-1] == "2022-01-01 00:00:59.247379260" @pytest.mark.parametrize( diff --git a/py-polars/tests/unit/functions/test_col.py b/py-polars/tests/unit/functions/test_col.py index a450782447da..759c91eb7720 100644 --- a/py-polars/tests/unit/functions/test_col.py +++ b/py-polars/tests/unit/functions/test_col.py @@ -26,7 +26,7 @@ def test_col_select() -> None: assert df.select(pl.col("hamburger", "foo")).columns == ["hamburger", "foo"] assert df.select(pl.col(pl.Series(["ham", "foo"]))).columns == ["ham", "foo"] # Dtypes - assert df.select(pl.col(pl.Utf8)).columns == ["bar"] + assert df.select(pl.col(pl.String)).columns == ["bar"] assert df.select(pl.col(pl.Int64, pl.Float64)).columns == [ "ham", "hamburger", diff --git a/py-polars/tests/unit/functions/test_concat.py b/py-polars/tests/unit/functions/test_concat.py index a35b4a1658c9..69f400e086a3 100644 --- a/py-polars/tests/unit/functions/test_concat.py +++ b/py-polars/tests/unit/functions/test_concat.py @@ -25,5 +25,5 @@ def test_concat_lf_stack_overflow() -> None: def test_empty_df_concat_str_11701() -> None: df = pl.DataFrame({"a": []}) - out = df.select(pl.concat_str([pl.col("a").cast(pl.Utf8), 
pl.lit("x")])) - assert_frame_equal(out, pl.DataFrame({"a": []}, schema={"a": pl.Utf8})) + out = df.select(pl.concat_str([pl.col("a").cast(pl.String), pl.lit("x")])) + assert_frame_equal(out, pl.DataFrame({"a": []}, schema={"a": pl.String})) diff --git a/py-polars/tests/unit/functions/test_repeat.py b/py-polars/tests/unit/functions/test_repeat.py index 03044d0f1405..fe060bd2c32d 100644 --- a/py-polars/tests/unit/functions/test_repeat.py +++ b/py-polars/tests/unit/functions/test_repeat.py @@ -16,7 +16,7 @@ (2**31 - 1, 5, None, pl.Int32), (-(2**31) - 1, 3, None, pl.Int64), (-(2**31), 3, None, pl.Int32), - ("foo", 2, None, pl.Utf8), + ("foo", 2, None, pl.String), (1.0, 5, None, pl.Float64), (True, 4, None, pl.Boolean), (None, 7, None, pl.Null), diff --git a/py-polars/tests/unit/functions/test_whenthen.py b/py-polars/tests/unit/functions/test_whenthen.py index 42dcf0c17fa7..2e85bd76108d 100644 --- a/py-polars/tests/unit/functions/test_whenthen.py +++ b/py-polars/tests/unit/functions/test_whenthen.py @@ -429,7 +429,7 @@ def test_when_then_nested_non_unit_literal_predicate_agg_broadcast_12242() -> No expect = pl.DataFrame( [ - pl.Series("array_name", ["A", "B"], dtype=pl.Utf8), + pl.Series("array_name", ["A", "B"], dtype=pl.String), pl.Series( "array_val", [[1, None, None, 2, None, 3], [4, None, None, None, None, 5]], @@ -463,7 +463,7 @@ def test_when_then_binary_op_predicate_agg_12526() -> None: ) expect = pl.DataFrame( - {"a": [1], "col": [None]}, schema={"a": pl.Int64, "col": pl.Utf8} + {"a": [1], "col": [None]}, schema={"a": pl.Int64, "col": pl.String} ) actual = df.group_by("a").agg( diff --git a/py-polars/tests/unit/interchange/test_buffer.py b/py-polars/tests/unit/interchange/test_buffer.py index b952a77782b6..0cc1fd393763 100644 --- a/py-polars/tests/unit/interchange/test_buffer.py +++ b/py-polars/tests/unit/interchange/test_buffer.py @@ -38,7 +38,7 @@ def test_init_invalid_input() -> None: (pl.Series([1, 2], dtype=pl.Int8), 2), (pl.Series([1, 2], 
dtype=pl.Int64), 16), (pl.Series([1.4, 2.9, 3.0], dtype=pl.Float32), 12), - (pl.Series(["a", "bc", "éâç"], dtype=pl.Utf8), 9), + (pl.Series(["a", "bc", "éâç"], dtype=pl.String), 9), (pl.Series(["a", "b", "a", "c", "a"], dtype=pl.Categorical), 20), (pl.Series([True, False], dtype=pl.Boolean), 1), (pl.Series([True] * 9, dtype=pl.Boolean), 2), diff --git a/py-polars/tests/unit/interchange/test_column.py b/py-polars/tests/unit/interchange/test_column.py index 73814b0c0cc0..24213b7f5623 100644 --- a/py-polars/tests/unit/interchange/test_column.py +++ b/py-polars/tests/unit/interchange/test_column.py @@ -71,7 +71,7 @@ def test_describe_categorical_lexical_ordering() -> None: def test_describe_categorical_other_dtype() -> None: - s = pl.Series(["a", "b", "a"], dtype=pl.Utf8) + s = pl.Series(["a", "b", "a"], dtype=pl.String) col = PolarsColumn(s) with pytest.raises(TypeError): col.describe_categorical @@ -241,7 +241,7 @@ def test_get_buffers_chunked_zero_copy_fails() -> None: (DtypeKind.FLOAT, 64, "g", "="), ), ( - pl.Series(["a", "bc", None, "éâç"], dtype=pl.Utf8), + pl.Series(["a", "bc", None, "éâç"], dtype=pl.String), pl.Series([97, 98, 99, 195, 169, 195, 162, 195, 167], dtype=pl.UInt8), (DtypeKind.STRING, 8, "U", "="), ), diff --git a/py-polars/tests/unit/interchange/test_roundtrip.py b/py-polars/tests/unit/interchange/test_roundtrip.py index e560357f33d4..5183acd983b7 100644 --- a/py-polars/tests/unit/interchange/test_roundtrip.py +++ b/py-polars/tests/unit/interchange/test_roundtrip.py @@ -22,7 +22,7 @@ pl.Float32, pl.Float64, pl.Boolean, - pl.Utf8, + pl.String, pl.Datetime, pl.Categorical, ] diff --git a/py-polars/tests/unit/interchange/test_utils.py b/py-polars/tests/unit/interchange/test_utils.py index 8c6bae00d015..b4469c695f51 100644 --- a/py-polars/tests/unit/interchange/test_utils.py +++ b/py-polars/tests/unit/interchange/test_utils.py @@ -28,7 +28,7 @@ (pl.Float32, (DtypeKind.FLOAT, 32, "f", NE)), (pl.Float64, (DtypeKind.FLOAT, 64, "g", NE)), (pl.Boolean, 
(DtypeKind.BOOL, 1, "b", NE)), - (pl.Utf8, (DtypeKind.STRING, 8, "U", NE)), + (pl.String, (DtypeKind.STRING, 8, "U", NE)), (pl.Date, (DtypeKind.DATETIME, 32, "tdD", NE)), (pl.Time, (DtypeKind.DATETIME, 64, "ttu", NE)), (pl.Categorical, (DtypeKind.CATEGORICAL, 32, "I", NE)), diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index 1cb99cbc7153..f6544d78cfb4 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -28,10 +28,10 @@ ("float64", [21.7, 21.8, 21], pl.Float64, np.float64), ("bool", [True, False, False], pl.Boolean, np.bool_), ("object", [21.7, "string1", object()], pl.Object, np.object_), - ("str", ["string1", "string2", "string3"], pl.Utf8, np.str_), + ("str", ["string1", "string2", "string3"], pl.String, np.str_), ("intc", [1, 3, 2], pl.Int32, np.intc), ("uintc", [1, 3, 2], pl.UInt32, np.uintc), - ("str_fixed", ["string1", "string2", "string3"], pl.Utf8, np.str_), + ("str_fixed", ["string1", "string2", "string3"], pl.String, np.str_), ( "bytes", [b"byte_string1", b"byte_string2", b"byte_string3"], @@ -69,11 +69,11 @@ def test_to_numpy(numpy_interop_test_data: Any, use_pyarrow: bool) -> None: @pytest.mark.parametrize("use_pyarrow", [True, False]) @pytest.mark.parametrize("has_null", [True, False]) -@pytest.mark.parametrize("dtype", [pl.Time, pl.Boolean, pl.Utf8]) +@pytest.mark.parametrize("dtype", [pl.Time, pl.Boolean, pl.String]) def test_to_numpy_no_zero_copy( use_pyarrow: bool, has_null: bool, dtype: pl.PolarsDataType ) -> None: - data: list[Any] = ["a", None] if dtype == pl.Utf8 else [0, None] + data: list[Any] = ["a", None] if dtype == pl.String else [0, None] series = pl.Series(data if has_null else data[:1], dtype=dtype) with pytest.raises(ValueError): series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow) @@ -112,8 +112,8 @@ def test_from_pandas() -> None: "int_nulls": pl.Float64, "floats": pl.Float64, "floats_nulls": pl.Float64, - 
"strings": pl.Utf8, - "strings_nulls": pl.Utf8, + "strings": pl.String, + "strings_nulls": pl.String, "strings-cat": pl.Categorical, } assert out.rows() == [ @@ -280,12 +280,12 @@ def test_from_pandas_null() -> None: # null column is an object dtype, so pl.Utf8 is most close df = pd.DataFrame([{"a": None}, {"a": None}]) out = pl.DataFrame(df) - assert out.dtypes == [pl.Utf8] + assert out.dtypes == [pl.String] assert out["a"][0] is None df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}]) out = pl.DataFrame(df) - assert out.dtypes == [pl.Utf8, pl.Int64] + assert out.dtypes == [pl.String, pl.Int64] def test_from_pandas_nested_list() -> None: @@ -339,16 +339,16 @@ def test_from_dicts() -> None: def test_from_dict_no_inference() -> None: - schema = {"a": pl.Utf8} + schema = {"a": pl.String} data = [{"a": "aa"}] pl.from_dicts(data, schema_overrides=schema, infer_schema_length=0) def test_from_dicts_schema_override() -> None: schema = { - "a": pl.Utf8, + "a": pl.String, "b": pl.Int64, - "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.Utf8, "z": pl.Float64})), + "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.String, "z": pl.Float64})), } # initial data matches the expected schema @@ -448,7 +448,7 @@ def test_from_numpy_structured() -> None: df = pl.DataFrame(data=arr).sort(by="price_usd", descending=True) assert df.schema == { - "product": pl.Utf8, + "product": pl.String, "price_usd": pl.Float64, "in_stock": pl.Boolean, } @@ -465,7 +465,7 @@ def test_from_numpy_structured() -> None: ), ): assert df.schema == { - "phone": pl.Utf8, + "phone": pl.String, "price_usd": pl.Float32, "available": pl.Boolean, } @@ -648,7 +648,7 @@ def test_from_empty_pandas_with_dtypes() -> None: df = pd.DataFrame(columns=["a", "b"]) df["a"] = df["a"].astype(str) df["b"] = df["b"].astype(float) - assert pl.from_pandas(df).dtypes == [pl.Utf8, pl.Float64] + assert pl.from_pandas(df).dtypes == [pl.String, pl.Float64] df = pl.DataFrame( data=[], @@ -657,7 +657,7 @@ def 
test_from_empty_pandas_with_dtypes() -> None: "b": pl.Datetime, "c": pl.Float32, "d": pl.Duration, - "e": pl.Utf8, + "e": pl.String, }, ).to_pandas() @@ -666,7 +666,7 @@ def test_from_empty_pandas_with_dtypes() -> None: pl.Datetime, pl.Float32, pl.Duration, - pl.Utf8, + pl.String, ] @@ -862,7 +862,7 @@ def test_dataframe_from_repr() -> None: "b": pl.Float64, "c": pl.Categorical, "d": pl.Boolean, - "e": pl.Utf8, + "e": pl.String, "f": pl.Date, "g": pl.Time, "h": pl.Datetime("ns"), @@ -887,7 +887,7 @@ def test_dataframe_from_repr() -> None: assert df.shape == (0, 6) assert df.rows() == [] assert df.schema == { - "id": pl.Utf8, + "id": pl.String, "q1": pl.Int8, "q2": pl.Int16, "q3": pl.Int32, @@ -907,7 +907,7 @@ def test_dataframe_from_repr() -> None: """ ), ) - assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.Utf8, "other": pl.Utf8})) + assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String})) # empty frame with non-standard/blank 'null' df = cast( @@ -1013,7 +1013,7 @@ def test_dataframe_from_repr() -> None: assert df.schema == { "source_actor_id": pl.Int32, "source_channel_id": pl.Int64, - "ident": pl.Utf8, + "ident": pl.String, "timestamp": pl.Datetime("us", "Asia/Tokyo"), } diff --git a/py-polars/tests/unit/io/test_avro.py b/py-polars/tests/unit/io/test_avro.py index af6aa4332191..d8856d6dc5c8 100644 --- a/py-polars/tests/unit/io/test_avro.py +++ b/py-polars/tests/unit/io/test_avro.py @@ -19,7 +19,7 @@ @pytest.fixture() def example_df() -> pl.DataFrame: - return pl.DataFrame({"i64": [1, 2], "f64": [0.1, 0.2], "utf8": ["a", "b"]}) + return pl.DataFrame({"i64": [1, 2], "f64": [0.1, 0.2], "str": ["a", "b"]}) @pytest.mark.parametrize("compression", COMPRESSIONS) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index b27cd75b3ab1..4cf64d3aa065 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -307,8 +307,8 @@ def test_partial_dtype_overwrite() -> None: 
""" ) f = io.StringIO(csv) - df = pl.read_csv(f, dtypes=[pl.Utf8]) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64] + df = pl.read_csv(f, dtypes=[pl.String]) + assert df.dtypes == [pl.String, pl.Int64, pl.Int64] def test_dtype_overwrite_with_column_name_selection() -> None: @@ -320,8 +320,8 @@ def test_dtype_overwrite_with_column_name_selection() -> None: """ ) f = io.StringIO(csv) - df = pl.read_csv(f, columns=["c", "b", "d"], dtypes=[pl.Int32, pl.Utf8]) - assert df.dtypes == [pl.Utf8, pl.Int32, pl.Int64] + df = pl.read_csv(f, columns=["c", "b", "d"], dtypes=[pl.Int32, pl.String]) + assert df.dtypes == [pl.String, pl.Int32, pl.Int64] def test_dtype_overwrite_with_column_idx_selection() -> None: @@ -333,10 +333,10 @@ def test_dtype_overwrite_with_column_idx_selection() -> None: """ ) f = io.StringIO(csv) - df = pl.read_csv(f, columns=[2, 1, 3], dtypes=[pl.Int32, pl.Utf8]) - # Columns without an explicit dtype set will get pl.Utf8 if dtypes is a list + df = pl.read_csv(f, columns=[2, 1, 3], dtypes=[pl.Int32, pl.String]) + # Columns without an explicit dtype set will get pl.String if dtypes is a list # if the column selection is done with column indices instead of column names. - assert df.dtypes == [pl.Utf8, pl.Int32, pl.Utf8] + assert df.dtypes == [pl.String, pl.Int32, pl.String] # Projections are sorted. 
assert df.columns == ["b", "c", "d"] @@ -433,18 +433,18 @@ def test_column_rename_and_dtype_overwrite() -> None: df = pl.read_csv( f, new_columns=["A", "B", "C"], - dtypes={"A": pl.Utf8, "B": pl.Int64, "C": pl.Float32}, + dtypes={"A": pl.String, "B": pl.Int64, "C": pl.Float32}, ) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32] + assert df.dtypes == [pl.String, pl.Int64, pl.Float32] f = io.StringIO(csv) df = pl.read_csv( f, columns=["a", "c"], new_columns=["A", "C"], - dtypes={"A": pl.Utf8, "C": pl.Float32}, + dtypes={"A": pl.String, "C": pl.Float32}, ) - assert df.dtypes == [pl.Utf8, pl.Float32] + assert df.dtypes == [pl.String, pl.Float32] csv = textwrap.dedent( """\ @@ -456,10 +456,10 @@ def test_column_rename_and_dtype_overwrite() -> None: df = pl.read_csv( f, new_columns=["A", "B", "C"], - dtypes={"A": pl.Utf8, "C": pl.Float32}, + dtypes={"A": pl.String, "C": pl.Float32}, has_header=False, ) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32] + assert df.dtypes == [pl.String, pl.Int64, pl.Float32] def test_compressed_csv(io_files_path: Path) -> None: @@ -672,10 +672,10 @@ def test_ignore_try_parse_dates() -> None: headers = ["a", "b", "c"] dtypes: dict[str, type[pl.DataType]] = { - k: pl.Utf8 for k in headers - } # Forces Utf8 type for every column + k: pl.String for k in headers + } # Forces String type for every column df = pl.read_csv(csv, columns=headers, dtypes=dtypes) - assert df.dtypes == [pl.Utf8, pl.Utf8, pl.Utf8] + assert df.dtypes == [pl.String, pl.String, pl.String] def test_csv_date_handling() -> None: @@ -723,10 +723,10 @@ def test_csv_globbing(io_files_path: Path) -> None: assert df.row(0) == ("vegetables", 2) with pytest.raises(ValueError): - _ = pl.read_csv(path, dtypes=[pl.Utf8, pl.Int64, pl.Int64, pl.Int64]) + _ = pl.read_csv(path, dtypes=[pl.String, pl.Int64, pl.Int64, pl.Int64]) dtypes = { - "category": pl.Utf8, + "category": pl.String, "calories": pl.Int32, "fats_g": pl.Float32, "sugars_g": pl.Int32, @@ -752,29 +752,29 @@ def 
test_csv_schema_offset(foods_file_path: Path) -> None: df = pl.read_csv(csv, skip_rows=3) assert df.columns == ["alpha", "beta", "gamma"] assert df.shape == (3, 3) - assert df.dtypes == [pl.Int64, pl.Float64, pl.Utf8] + assert df.dtypes == [pl.Int64, pl.Float64, pl.String] df = pl.read_csv(csv, skip_rows=2, skip_rows_after_header=1) assert df.columns == ["col1", "col2", "col3"] assert df.shape == (3, 3) - assert df.dtypes == [pl.Int64, pl.Float64, pl.Utf8] + assert df.dtypes == [pl.Int64, pl.Float64, pl.String] df = pl.scan_csv(foods_file_path, skip_rows=4).collect() assert df.columns == ["fruit", "60", "0", "11"] assert df.shape == (23, 4) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float64, pl.Int64] + assert df.dtypes == [pl.String, pl.Int64, pl.Float64, pl.Int64] df = pl.scan_csv(foods_file_path, skip_rows_after_header=24).collect() assert df.columns == ["category", "calories", "fats_g", "sugars_g"] assert df.shape == (3, 4) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64, pl.Int64] + assert df.dtypes == [pl.String, pl.Int64, pl.Int64, pl.Int64] df = pl.scan_csv( foods_file_path, skip_rows_after_header=24, infer_schema_length=1 ).collect() assert df.columns == ["category", "calories", "fats_g", "sugars_g"] assert df.shape == (3, 4) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64, pl.Int64] + assert df.dtypes == [pl.String, pl.Int64, pl.Int64, pl.Int64] def test_empty_string_missing_round_trip() -> None: @@ -817,7 +817,7 @@ def test_escaped_null_values() -> None: df = pl.read_csv( f, null_values={"a": "None", "b": "n/a", "c": "NA"}, - dtypes={"a": pl.Utf8, "b": pl.Int64, "c": pl.Float64}, + dtypes={"a": pl.String, "b": pl.Int64, "c": pl.Float64}, ) assert df[1, "a"] is None assert df[0, "b"] is None @@ -1336,7 +1336,7 @@ def test_csv_single_categorical_null() -> None: dtypes={"y": pl.Categorical}, ) - assert df.dtypes == [pl.Utf8, pl.Categorical, pl.Utf8] + assert df.dtypes == [pl.String, pl.Categorical, pl.String] assert df.to_dict(as_series=False) == 
{"x": ["A"], "y": [None], "z": ["A"]} @@ -1638,7 +1638,7 @@ def test_provide_schema() -> None: assert pl.read_csv( io.StringIO("A\nB,ragged\nC"), has_header=False, - schema={"A": pl.Utf8, "B": pl.Utf8, "C": pl.Utf8}, + schema={"A": pl.String, "B": pl.String, "C": pl.String}, ).to_dict(as_series=False) == { "A": ["A", "B", "C"], "B": [None, "ragged", None], diff --git a/py-polars/tests/unit/io/test_database_read.py b/py-polars/tests/unit/io/test_database_read.py index 58f9f19a13bc..7b4ad1d8bc32 100644 --- a/py-polars/tests/unit/io/test_database_read.py +++ b/py-polars/tests/unit/io/test_database_read.py @@ -186,7 +186,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 connect_using="connectorx", expected_dtypes={ "id": pl.UInt8, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float64, "date": pl.Date, }, @@ -205,9 +205,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 connect_using="adbc", expected_dtypes={ "id": pl.UInt8, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float64, - "date": pl.Utf8, + "date": pl.String, }, expected_dates=["2020-01-01", "2021-12-31"], schema_overrides={"id": pl.UInt8}, @@ -224,7 +224,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 connect_using=lambda path: sqlite3.connect(path, detect_types=True), expected_dtypes={ "id": pl.UInt8, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float32, "date": pl.Date, }, @@ -239,7 +239,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 connect_using=lambda path: sqlite3.connect(path, detect_types=True), expected_dtypes={ "id": pl.Int32, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float32, "date": pl.Date, }, @@ -258,7 +258,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 ).connect(), expected_dtypes={ "id": pl.Int64, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float64, "date": pl.Date, }, @@ -272,9 +272,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # 
noqa: D102 connect_using=adbc_sqlite_connect, expected_dtypes={ "id": pl.Int64, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float64, - "date": pl.Utf8, + "date": pl.String, }, expected_dates=["2020-01-01", "2021-12-31"], ), @@ -290,9 +290,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: # noqa: D102 connect_using=adbc_sqlite_connect, expected_dtypes={ "id": pl.Int64, - "name": pl.Utf8, + "name": pl.String, "value": pl.Float64, - "date": pl.Utf8, + "date": pl.String, }, expected_dates=["2020-01-01", "2021-12-31"], batch_size=1, diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 129dbbe8b2d6..18d2a8a3a72d 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -108,7 +108,7 @@ def test_read_delta_relative(delta_table_path: Path) -> None: @pytest.mark.write_disk() def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: - v0 = df.select(pl.col(pl.Utf8)) + v0 = df.select(pl.col(pl.String)) v1 = df.select(pl.col(pl.Int64)) df_supported = df.drop(["cat", "time"]) @@ -188,7 +188,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: @pytest.mark.parametrize( "series", [ - pl.Series("string", ["test"], dtype=pl.Utf8), + pl.Series("string", ["test"], dtype=pl.String), pl.Series("uint", [1], dtype=pl.UInt64), pl.Series("int", [1], dtype=pl.Int64), pl.Series( @@ -273,7 +273,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: "date_range_nested", pl.List(pl.List(pl.Datetime(time_unit="ms", time_zone=None))), ), - pl.Field("string", pl.Utf8), + pl.Field("string", pl.String), pl.Field("int", pl.UInt32), ] ), @@ -319,7 +319,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: pl.List(pl.Datetime(time_unit="ns", time_zone=None)) ), ), - pl.Field("string", pl.Utf8), + pl.Field("string", pl.String), pl.Field("int", pl.UInt32), ] ) diff --git a/py-polars/tests/unit/io/test_iceberg.py b/py-polars/tests/unit/io/test_iceberg.py 
index 3b40c9a71d94..eb90dbc053ee 100644 --- a/py-polars/tests/unit/io/test_iceberg.py +++ b/py-polars/tests/unit/io/test_iceberg.py @@ -35,7 +35,7 @@ def test_scan_iceberg_plain(iceberg_path: str) -> None: assert len(df.collect()) == 3 assert df.schema == { "id": pl.Int32, - "str": pl.Utf8, + "str": pl.String, "ts": pl.Datetime(time_unit="us", time_zone=None), } diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index d71dea5d0374..94c0a5dc2424 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -130,7 +130,7 @@ def test_ipc_schema(compression: IpcCompression) -> None: df.write_ipc(f, compression=compression) f.seek(0) - expected = {"a": pl.Int64(), "b": pl.Utf8(), "c": pl.Boolean()} + expected = {"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()} assert pl.read_ipc_schema(f) == expected @@ -158,8 +158,8 @@ def test_ipc_schema_from_file( "int_nulls": pl.Int64(), "floats": pl.Float64(), "floats_nulls": pl.Float64(), - "strings": pl.Utf8(), - "strings_nulls": pl.Utf8(), + "strings": pl.String(), + "strings_nulls": pl.String(), "date": pl.Date(), "datetime": pl.Datetime(), "time": pl.Time(), diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index 5efeec63d20b..0b64b3290602 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -45,13 +45,13 @@ def test_to_from_buffer_arraywise_schema() -> None: ]""" ) - read_df = pl.read_json(buf, schema={"b": pl.Utf8, "e": pl.Int16}) + read_df = pl.read_json(buf, schema={"b": pl.String, "e": pl.Int16}) assert_frame_equal( read_df, pl.DataFrame( { - "b": pl.Series(["foo", None, "bar"], dtype=pl.Utf8), + "b": pl.Series(["foo", None, "bar"], dtype=pl.String), "e": pl.Series([None, None, None], dtype=pl.Int16), } ), @@ -75,7 +75,7 @@ def test_to_from_buffer_arraywise_schema_override() -> None: pl.DataFrame( { "a": pl.Series([5, 11.4, -25.8], dtype=pl.Float64), - "b": 
pl.Series(["foo", None, "bar"], dtype=pl.Utf8), + "b": pl.Series(["foo", None, "bar"], dtype=pl.String), "c": pl.Series([None, 1, 0], dtype=pl.Int64), "d": pl.Series([None, 8, None], dtype=pl.Float64), } @@ -158,7 +158,7 @@ def test_ndjson_nested_null() -> None: assert df.to_dict(as_series=False) == {"foo": [{"bar": []}]} -def test_ndjson_nested_utf8_int() -> None: +def test_ndjson_nested_string_int() -> None: ndjson = """{"Accumulables":[{"Value":32395888},{"Value":"539454"}]}""" assert pl.read_ndjson(io.StringIO(ndjson)).to_dict(as_series=False) == { "Accumulables": [[{"Value": "32395888"}, {"Value": "539454"}]] @@ -196,7 +196,7 @@ def test_json_sliced_list_serialization() -> None: def test_json_deserialize_empty_list_10458() -> None: - schema = {"LIST_OF_STRINGS": pl.List(pl.Utf8)} + schema = {"LIST_OF_STRINGS": pl.List(pl.String)} serialized_schema = pl.DataFrame(schema=schema).write_json() df = pl.read_json(io.StringIO(serialized_schema)) assert df.schema == schema @@ -248,7 +248,7 @@ def test_ndjson_ignore_errors() -> None: schema = { "Fields": pl.List( - pl.Struct([pl.Field("Name", pl.Utf8), pl.Field("Value", pl.Int64)]) + pl.Struct([pl.Field("Name", pl.String), pl.Field("Value", pl.Int64)]) ) } # schema argument only parses Fields diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index 1df38661b8c9..f20b8ea9d24e 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -82,10 +82,10 @@ def test_scan_csv_schema_overwrite_and_dtypes_overwrite( file_path = io_files_path / file_name df = pl.scan_csv( file_path, - dtypes={"calories_foo": pl.Utf8, "fats_g_foo": pl.Float32}, + dtypes={"calories_foo": pl.String, "fats_g_foo": pl.Float32}, with_column_names=lambda names: [f"{a}_foo" for a in names], ).collect() - assert df.dtypes == [pl.Utf8, pl.Utf8, pl.Float32, pl.Int64] + assert df.dtypes == [pl.String, pl.String, pl.Float32, pl.Int64] assert df.columns == [ 
"category_foo", "calories_foo", @@ -102,10 +102,10 @@ def test_scan_csv_schema_overwrite_and_small_dtypes_overwrite( file_path = io_files_path / file_name df = pl.scan_csv( file_path, - dtypes={"calories_foo": pl.Utf8, "sugars_g_foo": dtype}, + dtypes={"calories_foo": pl.String, "sugars_g_foo": dtype}, with_column_names=lambda names: [f"{a}_foo" for a in names], ).collect() - assert df.dtypes == [pl.Utf8, pl.Utf8, pl.Float64, dtype] + assert df.dtypes == [pl.String, pl.String, pl.Float64, dtype] assert df.columns == [ "category_foo", "calories_foo", @@ -124,16 +124,16 @@ def test_scan_csv_schema_new_columns_dtypes( # assign 'new_columns', providing partial dtype overrides df1 = pl.scan_csv( file_path, - dtypes={"calories": pl.Utf8, "sugars": dtype}, + dtypes={"calories": pl.String, "sugars": dtype}, new_columns=["category", "calories", "fats", "sugars"], ).collect() - assert df1.dtypes == [pl.Utf8, pl.Utf8, pl.Float64, dtype] + assert df1.dtypes == [pl.String, pl.String, pl.Float64, dtype] assert df1.columns == ["category", "calories", "fats", "sugars"] # assign 'new_columns' with 'dtypes' list df2 = pl.scan_csv( file_path, - dtypes=[pl.Utf8, pl.Utf8, pl.Float64, dtype], + dtypes=[pl.String, pl.String, pl.Float64, dtype], new_columns=["category", "calories", "fats", "sugars"], ).collect() assert df1.rows() == df2.rows() @@ -143,7 +143,7 @@ def test_scan_csv_schema_new_columns_dtypes( file_path, new_columns=["colw", "colx", "coly", "colz"], ) - assert df3.dtypes == [pl.Utf8, pl.Int64, pl.Float64, pl.Int64] + assert df3.dtypes == [pl.String, pl.Int64, pl.Float64, pl.Int64] assert df3.columns == ["colw", "colx", "coly", "colz"] assert ( df3.select(["colz", "colx"]).collect().rows() @@ -153,17 +153,17 @@ def test_scan_csv_schema_new_columns_dtypes( # partially rename columns / overwrite dtypes df4 = pl.scan_csv( file_path, - dtypes=[pl.Utf8, pl.Utf8], + dtypes=[pl.String, pl.String], new_columns=["category", "calories"], ).collect() - assert df4.dtypes == [pl.Utf8, 
pl.Utf8, pl.Float64, pl.Int64] + assert df4.dtypes == [pl.String, pl.String, pl.Float64, pl.Int64] assert df4.columns == ["category", "calories", "fats_g", "sugars_g"] # cannot have len(new_columns) > len(actual columns) with pytest.raises(pl.ShapeError): pl.scan_csv( file_path, - dtypes=[pl.Utf8, pl.Utf8], + dtypes=[pl.String, pl.String], new_columns=["category", "calories", "c3", "c4", "c5"], ).collect() @@ -171,7 +171,7 @@ def test_scan_csv_schema_new_columns_dtypes( with pytest.raises(ValueError, match="mutually.exclusive"): pl.scan_csv( file_path, - dtypes=[pl.Utf8, pl.Utf8], + dtypes=[pl.String, pl.String], new_columns=["category", "calories", "fats", "sugars"], with_column_names=lambda cols: [col.capitalize() for col in cols], ).collect() @@ -250,7 +250,7 @@ def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> df = ( pl.scan_csv( foods_file_path, - dtypes={"calories": pl.Utf8, "sugars_g": pl.Int8}, + dtypes={"calories": pl.String, "sugars_g": pl.Int8}, ) .select(pl.count()) .collect() @@ -284,4 +284,4 @@ def test_scan_empty_csv_with_row_count(tmp_path: Path) -> None: df.write_csv(file_path) read = pl.scan_csv(file_path).with_row_count("idx") - assert read.collect().schema == OrderedDict([("idx", pl.UInt32), ("a", pl.Utf8)]) + assert read.collect().schema == OrderedDict([("idx", pl.UInt32), ("a", pl.String)]) diff --git a/py-polars/tests/unit/io/test_lazy_ipc.py b/py-polars/tests/unit/io/test_lazy_ipc.py index 341ce0842caa..e12b0658a292 100644 --- a/py-polars/tests/unit/io/test_lazy_ipc.py +++ b/py-polars/tests/unit/io/test_lazy_ipc.py @@ -58,7 +58,7 @@ def test_row_count_schema(foods_ipc_path: Path) -> None: pl.scan_ipc(foods_ipc_path, row_count_name="id") .select(["id", "category"]) .collect() - ).dtypes == [pl.UInt32, pl.Utf8] + ).dtypes == [pl.UInt32, pl.String] def test_glob_n_rows(io_files_path: Path) -> None: diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 
dcf77d9b29c4..ad2cf711b244 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -198,7 +198,7 @@ def test_row_count_schema_parquet(parquet_file_path: Path) -> None: pl.scan_parquet(str(parquet_file_path), row_count_name="id") .select(["id", "b"]) .collect() - ).dtypes == [pl.UInt32, pl.Utf8] + ).dtypes == [pl.UInt32, pl.String] @pytest.mark.write_disk() diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 832c475a6ac3..c2b66225622e 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -65,7 +65,7 @@ def test_write_parquet_using_pyarrow_write_to_dataset_with_partitioning( # cast is necessary as pyarrow writes partitions as categorical type read_df = pl.read_parquet(path_to_write, use_pyarrow=True).with_columns( - pl.col("partition_col").cast(pl.Utf8) + pl.col("partition_col").cast(pl.String) ) assert_frame_equal(df, read_df) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 7087827a1790..c64505acbc8a 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -580,10 +580,10 @@ def test_excel_write_multiple_tables() -> None: from xlsxwriter import Workbook # note: checks that empty tables don't error on write - df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) - df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) - df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) - df4 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) + df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) + df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) + df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) + df4 = 
pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) xls = BytesIO() with Workbook(xls) as wb: @@ -619,9 +619,9 @@ def test_excel_freeze_panes() -> None: from xlsxwriter import Workbook # note: checks that empty tables don't error on write - df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) - df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) - df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.Utf8, "colz": pl.Float64}) + df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) + df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) + df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64}) xls = BytesIO() diff --git a/py-polars/tests/unit/namespaces/string/test_string.py b/py-polars/tests/unit/namespaces/string/test_string.py index 91b1595ece29..50b5fdd248ac 100644 --- a/py-polars/tests/unit/namespaces/string/test_string.py +++ b/py-polars/tests/unit/namespaces/string/test_string.py @@ -19,7 +19,7 @@ def test_str_concat() -> None: s = pl.Series(["1", None, "2", None]) # propagate null assert_series_equal( - s.str.concat(ignore_nulls=False), pl.Series([None], dtype=pl.Utf8) + s.str.concat(ignore_nulls=False), pl.Series([None], dtype=pl.String) ) # ignore null assert_series_equal(s.str.concat(), pl.Series(["1-2"])) @@ -41,21 +41,21 @@ def test_str_concat2() -> None: def test_str_concat_all_null() -> None: - s = pl.Series([None, None, None], dtype=pl.Utf8) + s = pl.Series([None, None, None], dtype=pl.String) assert_series_equal( - s.str.concat(ignore_nulls=False), pl.Series([None], dtype=pl.Utf8) + s.str.concat(ignore_nulls=False), pl.Series([None], dtype=pl.String) ) assert_series_equal(s.str.concat(ignore_nulls=True), pl.Series([""])) def test_str_concat_empty_list() -> None: - s = pl.Series([], dtype=pl.Utf8) + s = pl.Series([], dtype=pl.String) 
assert_series_equal(s.str.concat(ignore_nulls=False), pl.Series([""])) assert_series_equal(s.str.concat(ignore_nulls=True), pl.Series([""])) def test_str_concat_empty_list2() -> None: - s = pl.Series([], dtype=pl.Utf8) + s = pl.Series([], dtype=pl.String) df = pl.DataFrame({"foo": s}) result = df.select(pl.col("foo").str.concat()).item() expected = "" @@ -63,7 +63,7 @@ def test_str_concat_empty_list2() -> None: def test_str_concat_empty_list_agg_context() -> None: - df = pl.DataFrame(data={"i": [1], "v": [None]}, schema_overrides={"v": pl.Utf8}) + df = pl.DataFrame(data={"i": [1], "v": [None]}, schema_overrides={"v": pl.String}) result = df.group_by("i").agg(pl.col("v").drop_nulls().str.concat())["v"].item() expected = "" assert result == expected @@ -387,8 +387,8 @@ def test_str_strip_prefix_literal() -> None: expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None]) assert_series_equal(s.str.strip_prefix("foo"), expected) # test null literal - expected = pl.Series([None, None, None, None, None, None], dtype=pl.Utf8) - assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.Utf8)), expected) + expected = pl.Series([None, None, None, None, None, None], dtype=pl.String) + assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected) def test_str_strip_prefix_suffix_expr() -> None: @@ -414,8 +414,8 @@ def test_str_strip_suffix() -> None: expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None]) assert_series_equal(s.str.strip_suffix("bar"), expected) # test null literal - expected = pl.Series([None, None, None, None, None, None], dtype=pl.Utf8) - assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.Utf8)), expected) + expected = pl.Series([None, None, None, None, None, None], dtype=pl.String) + assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected) def test_str_split() -> None: @@ -451,7 +451,7 @@ def test_json_decode_series() -> None: dtype2 = pl.Struct([pl.Field("a", pl.Int64)]) 
assert_series_equal(s.str.json_decode(dtype2), expected) - s = pl.Series([], dtype=pl.Utf8) + s = pl.Series([], dtype=pl.String) expected = pl.Series([], dtype=pl.List(pl.Int64)) dtype = pl.List(pl.Int64) assert_series_equal(s.str.json_decode(dtype), expected) @@ -485,9 +485,9 @@ def test_json_decode_nested_struct() -> None: expected_dtype = pl.List( pl.Struct( [ - pl.Field("key_1", pl.Utf8), + pl.Field("key_1", pl.String), pl.Field("key_2", pl.Int64), - pl.Field("key_3", pl.Utf8), + pl.Field("key_3", pl.String), ] ) ) @@ -522,8 +522,8 @@ def test_json_decode_primitive_to_list_11053() -> None: ) schema = pl.Struct( { - "col1": pl.List(pl.Utf8), - "col2": pl.List(pl.Utf8), + "col1": pl.List(pl.String), + "col2": pl.List(pl.String), } ) @@ -567,7 +567,7 @@ def test_str_concat_returns_scalar() -> None: .agg(pl.col("val").str.concat(delimiter=",").alias("grouped")) .get_column("grouped") ) - assert grouped.dtype == pl.Utf8 + assert grouped.dtype == pl.String def test_contains() -> None: @@ -843,7 +843,7 @@ def test_extract_all_many() -> None: "a": [["a"], ["a"], ["a"], [], None, []], "null": [None] * 6, } - assert broad.schema == {"a": pl.List(pl.Utf8), "null": pl.List(pl.Utf8)} + assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)} def test_extract_groups() -> None: @@ -1036,7 +1036,7 @@ def test_split_exact() -> None: { "field_0": ["a", None, "b", "c"], "field_1": ["a", None, None, "c"], - "field_2": pl.Series([None, None, None, None], dtype=pl.Utf8), + "field_2": pl.Series([None, None, None, None], dtype=pl.String), } ) @@ -1067,7 +1067,7 @@ def test_split_exact_expr() -> None: { "field_0": ["a", None, "b", "c", None], "field_1": ["a", None, None, "c", None], - "field_2": pl.Series([None, None, None, "c", None], dtype=pl.Utf8), + "field_2": pl.Series([None, None, None, "c", None], dtype=pl.String), } ) @@ -1081,7 +1081,7 @@ def test_split_exact_expr() -> None: { "field_0": ["a_", None, "b", "c^", None], "field_1": ["a", None, None, "c^", 
None], - "field_2": pl.Series([None, None, None, "c", None], dtype=pl.Utf8), + "field_2": pl.Series([None, None, None, "c", None], dtype=pl.String), } ) assert_frame_equal(out2, expected2) @@ -1167,7 +1167,7 @@ def test_string_extract_groups_lazy_schema_10305() -> None: "captures" ) - assert df.schema == {"candidate": pl.Utf8, "ref": pl.Utf8} + assert df.schema == {"candidate": pl.String, "ref": pl.String} def test_string_reverse() -> None: @@ -1181,7 +1181,7 @@ def test_string_reverse() -> None: pl.Series( "text", [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"], - dtype=pl.Utf8, + dtype=pl.String, ), ] ) diff --git a/py-polars/tests/unit/namespaces/test_binary.py b/py-polars/tests/unit/namespaces/test_binary.py index 7e6eba929c15..79dfd6c7e1ca 100644 --- a/py-polars/tests/unit/namespaces/test_binary.py +++ b/py-polars/tests/unit/namespaces/test_binary.py @@ -7,7 +7,7 @@ def test_binary_conversions() -> None: df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_columns( - pl.col("blob").cast(pl.Utf8).alias("decoded_blob") + pl.col("blob").cast(pl.String).alias("decoded_blob") ) assert df.to_dict(as_series=False) == { @@ -16,7 +16,7 @@ def test_binary_conversions() -> None: } assert df[0, 0] == b"abc" assert df[1, 0] is None - assert df.dtypes == [pl.Binary, pl.Utf8] + assert df.dtypes == [pl.Binary, pl.String] def test_contains() -> None: diff --git a/py-polars/tests/unit/namespaces/test_categorical.py b/py-polars/tests/unit/namespaces/test_categorical.py index f010434375ee..03ca6497eb0d 100644 --- a/py-polars/tests/unit/namespaces/test_categorical.py +++ b/py-polars/tests/unit/namespaces/test_categorical.py @@ -16,21 +16,21 @@ def test_categorical_lexical_sort() -> None: expected = pl.DataFrame( {"cats": ["a", "b", "k", "z", "z"], "vals": [2, 3, 2, 3, 1]} ) - assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.Utf8)), expected) + assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.String)), expected) out = df.sort(["cats", "vals"]) 
expected = pl.DataFrame( {"cats": ["a", "b", "k", "z", "z"], "vals": [2, 3, 2, 1, 3]} ) - assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.Utf8)), expected) + assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.String)), expected) out = df.sort(["vals", "cats"]) expected = pl.DataFrame( {"cats": ["z", "a", "k", "b", "z"], "vals": [1, 2, 2, 3, 3]} ) - assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.Utf8)), expected) + assert_frame_equal(out.with_columns(pl.col("cats").cast(pl.String)), expected) s = pl.Series(["a", "c", "a", "b", "a"], dtype=pl.Categorical("lexical")) - assert s.sort().cast(pl.Utf8).to_list() == [ + assert s.sort().cast(pl.String).to_list() == [ "a", "a", "a", diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index f4733e16cea5..f3de748386c3 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ -428,7 +428,7 @@ def test_list_gather() -> None: def test_list_eval_all_null() -> None: df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, None, None]}).with_columns( - pl.col("bar").cast(pl.List(pl.Utf8)) + pl.col("bar").cast(pl.List(pl.String)) ) assert df.select(pl.col("bar").list.eval(pl.element())).to_dict( diff --git a/py-polars/tests/unit/namespaces/test_name.py b/py-polars/tests/unit/namespaces/test_name.py index 3cfdae9c3fd8..cf08b7154c22 100644 --- a/py-polars/tests/unit/namespaces/test_name.py +++ b/py-polars/tests/unit/namespaces/test_name.py @@ -7,7 +7,7 @@ def test_name_change_case() -> None: df = pl.DataFrame( - schema={"ColX": pl.Int32, "ColY": pl.Utf8}, + schema={"ColX": pl.Int32, "ColY": pl.String}, ).with_columns( pl.all().name.to_uppercase(), pl.all().name.to_lowercase(), @@ -15,18 +15,18 @@ def test_name_change_case() -> None: assert df.schema == OrderedDict( [ ("ColX", pl.Int32), - ("ColY", pl.Utf8), + ("ColY", pl.String), ("COLX", pl.Int32), - ("COLY", pl.Utf8), + ("COLY", pl.String), ("colx", 
pl.Int32), - ("coly", pl.Utf8), + ("coly", pl.String), ] ) def test_name_prefix_suffix() -> None: df = pl.DataFrame( - schema={"ColX": pl.Int32, "ColY": pl.Utf8}, + schema={"ColX": pl.Int32, "ColY": pl.String}, ).with_columns( pl.all().name.prefix("#"), pl.all().name.suffix("!!"), @@ -34,11 +34,11 @@ def test_name_prefix_suffix() -> None: assert df.schema == OrderedDict( [ ("ColX", pl.Int32), - ("ColY", pl.Utf8), + ("ColY", pl.String), ("#ColX", pl.Int32), - ("#ColY", pl.Utf8), + ("#ColY", pl.String), ("ColX!!", pl.Int32), - ("ColY!!", pl.Utf8), + ("ColY!!", pl.String), ] ) diff --git a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py index 9ff8e714111d..d78ffd30cf1f 100644 --- a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py +++ b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py @@ -129,7 +129,7 @@ # --------------------------------------------- # string expr: case/cast ops # --------------------------------------------- - ("b", "lambda x: str(x).title()", 'pl.col("b").cast(pl.Utf8).str.to_titlecase()'), + ("b", "lambda x: str(x).title()", 'pl.col("b").cast(pl.String).str.to_titlecase()'), ( "b", 'lambda x: x.lower() + ":" + x.upper() + ":" + x.title()', @@ -294,7 +294,7 @@ def test_parse_apply_raw_functions() -> None: assert_frame_equal(*result_frames) # test primitive python casts - for py_cast, pl_dtype in ((str, pl.Utf8), (int, pl.Int64), (float, pl.Float64)): + for py_cast, pl_dtype in ((str, pl.String), (int, pl.Int64), (float, pl.Float64)): with pytest.warns( PolarsInefficientMapWarning, match=rf'(?s)with this one instead.*pl\.col\("a"\)\.cast\(pl\.{pl_dtype.__name__}\)', @@ -351,7 +351,7 @@ def x10(self, x: pl.Expr) -> pl.Expr: ( [1, 2, 3], lambda x: str(x), - "s.cast(pl.Utf8)", + "s.cast(pl.String)", ), ( [-20, -12, -5, 0, 5, 12, 20], diff --git a/py-polars/tests/unit/operations/map/test_map_groups.py 
b/py-polars/tests/unit/operations/map/test_map_groups.py index 90699300ea9d..315aadf50e57 100644 --- a/py-polars/tests/unit/operations/map/test_map_groups.py +++ b/py-polars/tests/unit/operations/map/test_map_groups.py @@ -127,7 +127,7 @@ def __init__(self, payload: Any): ) ) - assert result.dtypes == [pl.Utf8, pl.Object] + assert result.dtypes == [pl.String, pl.Object] def test_map_groups_numpy_output_3057() -> None: diff --git a/py-polars/tests/unit/operations/test_cast.py b/py-polars/tests/unit/operations/test_cast.py index 6f641e36139c..6cb20bce014a 100644 --- a/py-polars/tests/unit/operations/test_cast.py +++ b/py-polars/tests/unit/operations/test_cast.py @@ -16,7 +16,7 @@ ) -def test_utf8_date() -> None: +def test_string_date() -> None: df = pl.DataFrame({"x1": ["2021-01-01"]}).with_columns( **{"x1-date": pl.col("x1").cast(pl.Date)} ) @@ -25,14 +25,14 @@ def test_utf8_date() -> None: assert_frame_equal(expected, out) -def test_invalid_utf8_date() -> None: +def test_invalid_string_date() -> None: df = pl.DataFrame({"x1": ["2021-01-aa"]}) with pytest.raises(ComputeError): df.with_columns(**{"x1-date": pl.col("x1").cast(pl.Date)}) -def test_utf8_datetime() -> None: +def test_string_datetime() -> None: df = pl.DataFrame( {"x1": ["2021-12-19T00:39:57", "2022-12-19T16:39:57"]} ).with_columns( @@ -62,7 +62,7 @@ def test_utf8_datetime() -> None: assert_frame_equal(expected, out) -def test_invalid_utf8_datetime() -> None: +def test_invalid_string_datetime() -> None: df = pl.DataFrame({"x1": ["2021-12-19 00:39:57", "2022-12-19 16:39:57"]}) with pytest.raises(ComputeError): df.with_columns( @@ -70,7 +70,7 @@ def test_invalid_utf8_datetime() -> None: ) -def test_utf8_datetime_timezone() -> None: +def test_string_datetime_timezone() -> None: ccs_tz = "America/Caracas" stg_tz = "America/Santiago" utc_tz = "UTC" diff --git a/py-polars/tests/unit/operations/test_comparison.py b/py-polars/tests/unit/operations/test_comparison.py index be54576a5cbf..3980048d30f0 100644 --- 
a/py-polars/tests/unit/operations/test_comparison.py +++ b/py-polars/tests/unit/operations/test_comparison.py @@ -37,14 +37,14 @@ def test_comparison_order_null_broadcasting() -> None: def test_comparison_nulls_single() -> None: df1 = pl.DataFrame( { - "a": pl.Series([None], dtype=pl.Utf8), + "a": pl.Series([None], dtype=pl.String), "b": pl.Series([None], dtype=pl.Int64), "c": pl.Series([None], dtype=pl.Boolean), } ) df2 = pl.DataFrame( { - "a": pl.Series([None], dtype=pl.Utf8), + "a": pl.Series([None], dtype=pl.String), "b": pl.Series([None], dtype=pl.Int64), "c": pl.Series([None], dtype=pl.Boolean), } @@ -330,12 +330,12 @@ def test_total_ordering_float_series(lhs: float | None, rhs: float | None) -> No @pytest.mark.parametrize("lhs", INTERESTING_STRING_VALUES) @pytest.mark.parametrize("rhs", INTERESTING_STRING_VALUES) def test_total_ordering_string_series(lhs: str | None, rhs: str | None) -> None: - verify_total_ordering(lhs, rhs, "", pl.Utf8) + verify_total_ordering(lhs, rhs, "", pl.String) context: pytest.WarningsRecorder | ContextManager[None] = ( pytest.warns(UserWarning) if rhs is None else nullcontext() ) with context: - verify_total_ordering_broadcast(lhs, rhs, "", pl.Utf8) + verify_total_ordering_broadcast(lhs, rhs, "", pl.String) @pytest.mark.parametrize("str_lhs", INTERESTING_STRING_VALUES) diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index d5c501f70803..086b5a1cf8d2 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -145,7 +145,7 @@ def test_sliced_null_explode() -> None: assert s.slice(2, 4).list.explode().to_list() == [True, False, None, True] -def test_utf8_explode() -> None: +def test_string_explode() -> None: assert pl.Series(["foobar", None]).str.explode().to_list() == [ "f", "o", @@ -270,7 +270,7 @@ def test_explode_binary() -> None: def test_explode_null_list() -> None: - assert pl.Series([["a"], None], 
dtype=pl.List(pl.Utf8))[ + assert pl.Series([["a"], None], dtype=pl.List(pl.String))[ 1:2 ].list.min().to_list() == [None] @@ -320,7 +320,7 @@ def test_explode_array() -> None: assert_frame_equal(out, expected) -def test_utf8_list_agg_explode() -> None: +def test_string_list_agg_explode() -> None: df = pl.DataFrame({"a": [[None], ["b"]]}) df = df.select( diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index 93cd542cb5dd..1eb1a25d5bc4 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -81,7 +81,7 @@ def test_filter_is_in_4572() -> None: @pytest.mark.parametrize( - "dtype", [pl.Int32, pl.Boolean, pl.Utf8, pl.Binary, pl.List(pl.Int64), pl.Object] + "dtype", [pl.Int32, pl.Boolean, pl.String, pl.Binary, pl.List(pl.Int64), pl.Object] ) def test_filter_on_empty(dtype: PolarsDataType) -> None: df = pl.DataFrame({"a": []}, schema={"a": dtype}) @@ -195,7 +195,7 @@ def test_agg_function_of_filter_10565() -> None: as_series=False ) == {"a": []} - df_str = pl.DataFrame(data={"a": []}, schema={"a": pl.Utf8}) + df_str = pl.DataFrame(data={"a": []}, schema={"a": pl.String}) assert df_str.filter(pl.col("a").n_unique().over("a") == 1).to_dict( as_series=False ) == {"a": []} diff --git a/py-polars/tests/unit/operations/test_is_first_last_distinct.py b/py-polars/tests/unit/operations/test_is_first_last_distinct.py index 45c8d9c66956..ad5471961b93 100644 --- a/py-polars/tests/unit/operations/test_is_first_last_distinct.py +++ b/py-polars/tests/unit/operations/test_is_first_last_distinct.py @@ -112,7 +112,7 @@ def test_is_last_distinct() -> None: assert s.is_last_distinct().to_list() == expected -@pytest.mark.parametrize("dtypes", [pl.Int32, pl.Utf8, pl.Boolean, pl.List(pl.Int32)]) +@pytest.mark.parametrize("dtypes", [pl.Int32, pl.String, pl.Boolean, pl.List(pl.Int32)]) def test_is_first_last_distinct_all_null(dtypes: pl.PolarsDataType) -> None: s = 
pl.Series([None, None, None], dtype=dtypes) assert s.is_first_distinct().to_list() == [True, False, False] diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 967dcf19dd51..98cc993fc13c 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -124,7 +124,7 @@ def test_is_in_series() -> None: out = s.is_in(["a", "b"]) assert out.to_list() == [True, True, False] - # Check if empty list is converted to pl.Utf8. + # Check if empty list is converted to pl.String out = s.is_in([]) assert out.to_list() == [False] * out.len() @@ -252,7 +252,7 @@ def test_cat_is_in_series(dtype: pl.DataType) -> None: expected = pl.Series([False, True, True, None]) assert_series_equal(s.is_in(s2), expected) - s2_str = s2.cast(pl.Utf8) + s2_str = s2.cast(pl.String) assert_series_equal(s.is_in(s2_str), expected) @@ -264,7 +264,7 @@ def test_cat_is_in_series_non_existent() -> None: expected = pl.Series([True, False, False, None]) assert_series_equal(s.is_in(s2), expected) - s2_str = s2.cast(pl.Utf8) + s2_str = s2.cast(pl.String) assert_series_equal(s.is_in(s2_str), expected) diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index eae24dfc2ab4..9a2a38965c03 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -429,7 +429,7 @@ def test_asof_join_sorted_by_group(capsys: Any) -> None: expected = pl.DataFrame( [ - pl.Series("key", ["a", "a", "a", "b", "b", "b"], dtype=pl.Utf8), + pl.Series("key", ["a", "a", "a", "b", "b", "b"], dtype=pl.String), pl.Series("asof_key", [1.0, 2.0, 3.0, 1.0, 2.0, 3.0], dtype=pl.Float64), pl.Series("a", [101, 102, 103, 104, 105, 106], dtype=pl.Int64), pl.Series("b", [201, 202, 203, 204, 205, 206], dtype=pl.Int64), diff --git a/py-polars/tests/unit/operations/test_replace.py b/py-polars/tests/unit/operations/test_replace.py 
index fdfadca8d5ea..e178d7ef3f44 100644 --- a/py-polars/tests/unit/operations/test_replace.py +++ b/py-polars/tests/unit/operations/test_replace.py @@ -108,7 +108,7 @@ def test_replace_enum_to_str() -> None: result = s.replace({"a": "c", "b": "d"}) - expected = pl.Series(["c", "d", "c"], dtype=pl.Utf8) + expected = pl.Series(["c", "d", "c"], dtype=pl.String) assert_series_equal(result, expected) @@ -323,7 +323,7 @@ def test_replace_mix() -> None: [ pl.Series("float_to_boolean", [True, None], dtype=pl.Boolean), pl.Series("boolean_to_int", [1, 0], dtype=pl.Int64), - pl.Series("boolean_to_str", ["1", "0"], dtype=pl.Utf8), + pl.Series("boolean_to_str", ["1", "0"], dtype=pl.String), ] ) assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_unique.py b/py-polars/tests/unit/operations/test_unique.py index 664d04ba1f34..8d5bc3594a40 100644 --- a/py-polars/tests/unit/operations/test_unique.py +++ b/py-polars/tests/unit/operations/test_unique.py @@ -16,7 +16,7 @@ def test_unique_predicate_pd() -> None: .filter(pl.col("z")) .collect() ) - expected = pl.DataFrame(schema={"x": pl.Utf8, "y": pl.Utf8, "z": pl.Boolean}) + expected = pl.DataFrame(schema={"x": pl.String, "y": pl.String, "z": pl.Boolean}) assert_frame_equal(result, expected) result = ( diff --git a/py-polars/tests/unit/operations/test_value_counts.py b/py-polars/tests/unit/operations/test_value_counts.py index 60329cdb9238..b38992b5146a 100644 --- a/py-polars/tests/unit/operations/test_value_counts.py +++ b/py-polars/tests/unit/operations/test_value_counts.py @@ -23,7 +23,7 @@ def test_value_counts_logical_type() -> None: ) out = df.select(pl.all().value_counts()) assert out["ac"].struct.field("ac").dtype == pl.Categorical - assert out["a"].struct.field("a").dtype == pl.Utf8 + assert out["a"].struct.field("a").dtype == pl.String def test_value_counts_expr() -> None: diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index 
35c566ada53f..38d4e258c57d 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -330,7 +330,7 @@ def test_window_function_implode_contention_8536() -> None: "policy": ["a", "b", "c", "c", "d", "d", "d", "d", "e", "e"], "memo": ["LE", "RM", "", "", "", "LE", "", "", "", "RM"], }, - schema={"policy": pl.Utf8, "memo": pl.Utf8}, + schema={"policy": pl.String, "memo": pl.String}, ) assert df.select( diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 468c42d761ec..74f93f5644e0 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -76,20 +76,20 @@ def test_init_inputs(monkeypatch: Any) -> None: assert pl.Series("a").dtype == pl.Null # Null dtype used in case of no data assert pl.Series().dtype == pl.Null assert pl.Series([]).dtype == pl.Null - assert pl.Series(dtype_if_empty=pl.Utf8).dtype == pl.Utf8 + assert pl.Series(dtype_if_empty=pl.String).dtype == pl.String assert pl.Series([], dtype_if_empty=pl.UInt16).dtype == pl.UInt16 assert ( pl.Series([None, None, None]).dtype == pl.Null ) # f32 type used for list with only None assert pl.Series([None, None, None], dtype_if_empty=pl.Int8).dtype == pl.Int8 - # note: "== []" will be cast to empty Series with Utf8 dtype. + # note: "== []" will be cast to empty Series with String dtype. 
assert_series_equal( - pl.Series([], dtype_if_empty=pl.Utf8) == [], pl.Series("", dtype=pl.Boolean) + pl.Series([], dtype_if_empty=pl.String) == [], pl.Series("", dtype=pl.Boolean) ) assert pl.Series(values=[True, False]).dtype == pl.Boolean assert pl.Series(values=np.array([True, False])).dtype == pl.Boolean - assert pl.Series(values=np.array(["foo", "bar"])).dtype == pl.Utf8 - assert pl.Series(values=["foo", "bar"]).dtype == pl.Utf8 + assert pl.Series(values=np.array(["foo", "bar"])).dtype == pl.String + assert pl.Series(values=["foo", "bar"]).dtype == pl.String assert pl.Series("a", [pl.Series([1, 2, 4]), pl.Series([3, 2, 1])]).dtype == pl.List assert pl.Series("a", [10000, 20000, 30000], dtype=pl.Time).dtype == pl.Time @@ -133,7 +133,7 @@ def test_init_inputs(monkeypatch: Any) -> None: assert pl.Series( values=np.array([["foo", "bar"], ["foo2", "bar2"]]) - ).dtype == pl.List(pl.Utf8) + ).dtype == pl.List(pl.String) # lists assert pl.Series("a", [[1, 2], [3, 4]]).dtype == pl.List(pl.Int64) @@ -222,9 +222,9 @@ class TeaShipmentPD(pydantic.BaseModel): assert isinstance(s, pl.Series) assert s.dtype.fields == [ # type: ignore[attr-defined] - Field("exporter", pl.Utf8), - Field("importer", pl.Utf8), - Field("product", pl.Utf8), + Field("exporter", pl.String), + Field("importer", pl.String), + Field("product", pl.String), Field("tonnes", pl.Int64), ] assert s.to_list() == [ @@ -260,7 +260,7 @@ def test_concat() -> None: @pytest.mark.parametrize( "dtype", - [pl.Int64, pl.Float64, pl.Utf8, pl.Boolean], + [pl.Int64, pl.Float64, pl.String, pl.Boolean], ) def test_eq_missing_list_and_primitive(dtype: PolarsDataType) -> None: s1 = pl.Series([None, None], dtype=dtype) @@ -507,7 +507,7 @@ def test_add_string() -> None: [ (100, pl.Int64), (8.5, pl.Float64), - ("서울특별시", pl.Utf8), + ("서울특별시", pl.String), (date.today(), pl.Date), (datetime.now(), pl.Datetime("us")), (time(23, 59, 59), pl.Time), @@ -894,9 +894,9 @@ def test_ufunc() -> None: def test_numpy_string_array() -> None: - 
s_utf8 = pl.Series("a", ["aa", "bb", "cc", "dd"], dtype=pl.Utf8) + s_str = pl.Series("a", ["aa", "bb", "cc", "dd"], dtype=pl.String) assert_array_equal( - np.char.capitalize(s_utf8), + np.char.capitalize(s_str), np.array(["Aa", "Bb", "Cc", "Dd"], dtype=" None: pl.Series("i64", [1, 2, None], dtype=pl.Int64), pl.Series("f32", [1, 2, None], dtype=pl.Float32), pl.Series("cat", ["a", "b", None], dtype=pl.Categorical), - pl.Series("str", ["a", "b", None], dtype=pl.Utf8), + pl.Series("str", ["a", "b", None], dtype=pl.String), pl.Series("bool", [True, True, None], dtype=pl.Boolean), ] ) @@ -1093,12 +1093,12 @@ def test_fill_null() -> None: assert out.dtypes == [pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64] -def test_utf8_series_min_max_10674() -> None: - utf8_series = pl.Series("b", ["a", None, "c", None, "e"], dtype=pl.Utf8) - assert utf8_series.min() == "a" - assert utf8_series.max() == "e" - assert utf8_series.sort(descending=False).min() == "a" - assert utf8_series.sort(descending=True).max() == "e" +def test_str_series_min_max_10674() -> None: + str_series = pl.Series("b", ["a", None, "c", None, "e"], dtype=pl.String) + assert str_series.min() == "a" + assert str_series.max() == "e" + assert str_series.sort(descending=False).min() == "a" + assert str_series.sort(descending=True).max() == "e" def test_fill_nan() -> None: @@ -1790,18 +1790,18 @@ def test_arg_min_and_arg_max() -> None: assert s.arg_min() is None assert s.arg_max() is None - # utf8 no null + # str no null s = pl.Series(["a", "c", "b"]) assert s.arg_min() == 0 assert s.arg_max() == 1 - # utf8 has null + # str has null s = pl.Series([None, "a", None, "b"]) assert s.arg_min() == 1 assert s.arg_max() == 3 - # utf8 all null - s = pl.Series([None, None], dtype=pl.Utf8) + # str all null + s = pl.Series([None, None], dtype=pl.String) assert s.arg_min() is None assert s.arg_max() is None @@ -1829,7 +1829,7 @@ def test_arg_min_and_arg_max() -> None: assert s.arg_min() == 5 assert s.arg_max() == 1 - # test 
ascending and descending utf8 series + # test ascending and descending str series s = pl.Series([None, "a", "b", "c", "d", "e"]) s.sort(in_place=True) # set ascending sorted flag assert s.flags == {"SORTED_ASC": True, "SORTED_DESC": False} @@ -1851,8 +1851,8 @@ def test_arg_min_and_arg_max() -> None: assert s.arg_min() is None assert s.arg_max() is None - # test utf8 empty series - s = pl.Series([], dtype=pl.Utf8) + # test str empty series + s = pl.Series([], dtype=pl.String) assert s.arg_min() is None assert s.arg_max() is None @@ -1879,7 +1879,7 @@ def test_is_unique() -> None: s = pl.Series("a", [1, 2, 2, 3]) assert_series_equal(s.is_unique(), pl.Series("a", [True, False, False, True])) - # utf8 + # str assert pl.Series(["a", "b", "c", "a"]).is_duplicated().to_list() == [ True, False, @@ -1990,7 +1990,7 @@ def test_reshape() -> None: def test_init_categorical() -> None: with pl.StringCache(): for values in [[None], ["foo", "bar"], [None, "foo", "bar"]]: - expected = pl.Series("a", values, dtype=pl.Utf8).cast(pl.Categorical) + expected = pl.Series("a", values, dtype=pl.String).cast(pl.Categorical) a = pl.Series("a", values, dtype=pl.Categorical) assert_series_equal(a, expected) @@ -2024,7 +2024,7 @@ def test_iter_nested_struct() -> None: pl.Float32, pl.Int32, pl.Boolean, - pl.List(pl.Utf8), + pl.List(pl.String), pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]), ], ) @@ -2308,7 +2308,7 @@ def test_ewm_param_validation() -> None: (4, pl.UInt32), (4.5, pl.Float32), (None, pl.Float64), - ("白鵬翔", pl.Utf8), + ("白鵬翔", pl.String), (date.today(), pl.Date), (datetime.now(), pl.Datetime("ns")), (time(23, 59, 59), pl.Time), @@ -2720,10 +2720,10 @@ def test_series_cmp_fast_paths() -> None: ).to_list() == [None, None] assert ( - pl.Series([None], dtype=pl.Utf8) != pl.Series(["a", "b"], dtype=pl.Utf8) + pl.Series([None], dtype=pl.String) != pl.Series(["a", "b"], dtype=pl.String) ).to_list() == [None, None] assert ( - pl.Series([None], dtype=pl.Utf8) == 
pl.Series(["a", "b"], dtype=pl.Utf8) + pl.Series([None], dtype=pl.String) == pl.Series(["a", "b"], dtype=pl.String) ).to_list() == [None, None] assert ( diff --git a/py-polars/tests/unit/series/test_to_numpy.py b/py-polars/tests/unit/series/test_to_numpy.py index f842a807a49a..e245009e7171 100644 --- a/py-polars/tests/unit/series/test_to_numpy.py +++ b/py-polars/tests/unit/series/test_to_numpy.py @@ -14,7 +14,7 @@ ).filter( lambda s: ( getattr(s.dtype, "time_unit", None) != "ms" - and not (s.dtype == pl.Utf8 and s.str.contains("\x00").any()) + and not (s.dtype == pl.String and s.str.contains("\x00").any()) and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any()) ) ), diff --git a/py-polars/tests/unit/sql/test_sql.py b/py-polars/tests/unit/sql/test_sql.py index 360f1cd8738a..54835f78a84c 100644 --- a/py-polars/tests/unit/sql/test_sql.py +++ b/py-polars/tests/unit/sql/test_sql.py @@ -57,11 +57,11 @@ def test_sql_cast() -> None: "b_i16": pl.Int16, "b_i64": pl.Int64, "d_i8": pl.Int8, - "a_char": pl.Utf8, - "b_varchar": pl.Utf8, + "a_char": pl.String, + "b_varchar": pl.String, "c_blob": pl.Binary, "c_varbinary": pl.Binary, - "d_charvar": pl.Utf8, + "d_charvar": pl.String, } assert res.rows() == [ (1.0, 1.0, 1, 1, 1, 1, "1", "1.1", b"a", b"a", "true"), @@ -562,7 +562,7 @@ def test_sql_limit_offset() -> None: ), ( "SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b USING (a)", - pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.Utf8}), + pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}), ), ( "SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)", diff --git a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 28bceef75d21..191759c9c6ea 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -249,7 +249,7 @@ def test_streaming_9776() -> None: def test_stream_empty_file(tmp_path: Path) -> None: p = 
tmp_path / "in.parquet" schema = { - "KLN_NR": pl.Utf8, + "KLN_NR": pl.String, } df = pl.DataFrame( diff --git a/py-polars/tests/unit/streaming/test_streaming_group_by.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py index 66078051db2e..d3b76294f986 100644 --- a/py-polars/tests/unit/streaming/test_streaming_group_by.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -74,10 +74,10 @@ def test_streaming_group_by_types() -> None: .collect(streaming=True) ) assert out.schema == { - "str_first": pl.Utf8, - "str_last": pl.Utf8, - "str_mean": pl.Utf8, - "str_sum": pl.Utf8, + "str_first": pl.String, + "str_last": pl.String, + "str_mean": pl.String, + "str_sum": pl.String, "bool_first": pl.Boolean, "bool_last": pl.Boolean, "bool_mean": pl.Boolean, @@ -150,7 +150,7 @@ def test_streaming_non_streaming_gb() -> None: q = df.lazy().group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) - q = df.lazy().with_columns(pl.col("a").cast(pl.Utf8)) + q = df.lazy().with_columns(pl.col("a").cast(pl.String)) q = q.group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").alias("b")) diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index 0a37e38deaab..697e90f2985e 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -38,7 +38,7 @@ def test_scan_csv_overwrite_small_dtypes( ) -> None: file_path = io_files_path / "foods1.csv" df = pl.scan_csv(file_path, dtypes={"sugars_g": dtype}).collect(streaming=True) - assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float64, dtype] + assert df.dtypes == [pl.String, pl.Int64, pl.Float64, dtype] @pytest.mark.write_disk() diff --git a/py-polars/tests/unit/test_constructors.py b/py-polars/tests/unit/test_constructors.py index ca20746c036d..40642b456af7 100644 --- 
a/py-polars/tests/unit/test_constructors.py +++ b/py-polars/tests/unit/test_constructors.py @@ -101,11 +101,11 @@ def test_init_dict() -> None: assert df.schema == {"a": pl.Null, "b": pl.Null} for df in ( - pl.DataFrame({}, schema={"a": pl.Date, "b": pl.Utf8}), - pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.Utf8}), + pl.DataFrame({}, schema={"a": pl.Date, "b": pl.String}), + pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.String}), ): assert df.shape == (0, 2) - assert df.schema == {"a": pl.Date, "b": pl.Utf8} + assert df.schema == {"a": pl.Date, "b": pl.String} # List of empty list df = pl.DataFrame({"a": [[]], "b": [[]]}) @@ -227,7 +227,7 @@ class TradeNT(NamedTuple): df = DF(data=trades) # type: ignore[operator] assert df.schema == { "timestamp": pl.Datetime("us"), - "ticker": pl.Utf8, + "ticker": pl.String, "price": pl.Decimal(scale=1), "size": pl.Int64, } @@ -240,7 +240,7 @@ class TradeNT(NamedTuple): ) assert df.schema == { "timestamp": pl.Datetime("ms"), - "ticker": pl.Utf8, + "ticker": pl.String, "price": pl.Decimal(scale=1), "size": pl.Int32, } @@ -362,7 +362,7 @@ def test_init_structured_objects_nested() -> None: "x": pl.Int64, "y": pl.Struct( [ - pl.Field("a", pl.Utf8), + pl.Field("a", pl.String), pl.Field("b", pl.Int64), pl.Field( "c", @@ -370,7 +370,7 @@ def test_init_structured_objects_nested() -> None: [ pl.Field("d", pl.Datetime("us")), pl.Field("e", pl.Float64), - pl.Field("f", pl.Utf8), + pl.Field("f", pl.String), ] ), ), @@ -395,7 +395,7 @@ def test_init_structured_objects_nested() -> None: "x": pl.Int16, "y": pl.Struct( [ - pl.Field("a", pl.Utf8), + pl.Field("a", pl.String), pl.Field("b", pl.Int32), pl.Field( name="c", @@ -403,7 +403,7 @@ def test_init_structured_objects_nested() -> None: [ pl.Field("d", pl.Datetime("ms")), pl.Field("e", pl.Float32), - pl.Field("f", pl.Utf8), + pl.Field("f", pl.String), ] ), ), @@ -429,11 +429,11 @@ def test_init_structured_objects_nested() -> None: # 
└─────┴───────┴─────┴─────────────────────┴───────┴───────┘ assert df.schema == { "x": pl.Int16, - "a": pl.Utf8, + "a": pl.String, "b": pl.Int32, "d": pl.Datetime("ms"), "e": pl.Float32, - "f": pl.Utf8, + "f": pl.String, } assert df.row(0) == ( 100, @@ -517,7 +517,7 @@ def test_init_ndarray() -> None: orient="row", ) assert df.rows() == [(True, 2, "a"), (None, None, None)] - assert df.schema == {"x": pl.Boolean, "y": pl.Int32, "z": pl.Utf8} + assert df.schema == {"x": pl.Boolean, "y": pl.Int32, "z": pl.String} # 2D array - default to column orientation df = pl.DataFrame(np.array([[1, 2], [3, 4]], dtype=np.int64)) @@ -1055,13 +1055,13 @@ def test_from_dicts_list_struct_without_inner_dtype_5611() -> None: def test_upcast_primitive_and_strings() -> None: assert pl.Series([1, 1.0, 1]).dtype == pl.Float64 - assert pl.Series([1, 1, "1.0"]).dtype == pl.Utf8 - assert pl.Series([1, 1.0, "1.0"]).dtype == pl.Utf8 + assert pl.Series([1, 1, "1.0"]).dtype == pl.String + assert pl.Series([1, 1.0, "1.0"]).dtype == pl.String assert pl.Series([True, 1]).dtype == pl.Int64 assert pl.Series([True, 1.0]).dtype == pl.Float64 assert pl.Series([True, 1], dtype=pl.Boolean).dtype == pl.Boolean assert pl.Series([False, 1.0], dtype=pl.Boolean).dtype == pl.Boolean - assert pl.Series([False, "1.0"]).dtype == pl.Utf8 + assert pl.Series([False, "1.0"]).dtype == pl.String assert pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}).dtypes == [ pl.Float64, pl.Float64, @@ -1100,10 +1100,10 @@ def test_from_rows_dtype() -> None: # 5182 df = pl.DataFrame( data=[(None, None)] * 50 + [("1.23", None)], - schema=[("foo", pl.Utf8), ("bar", pl.Utf8)], + schema=[("foo", pl.String), ("bar", pl.String)], orient="row", ) - assert df.dtypes == [pl.Utf8, pl.Utf8] + assert df.dtypes == [pl.String, pl.String] assert df.null_count().row(0) == (50, 51) type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}] @@ -1155,7 +1155,7 @@ def test_from_dicts_schema() -> None: # provide data that resolves to an 
empty frame (ref: scalar # expansion shortcut), with schema/override hints - schema = {"colx": pl.Utf8, "coly": pl.Int32} + schema = {"colx": pl.String, "coly": pl.Int32} for param in ("schema", "schema_overrides"): df = pl.DataFrame({"colx": [], "coly": 0}, **{param: schema}) # type: ignore[arg-type] @@ -1219,7 +1219,7 @@ def test_nested_read_dict_4143_2() -> None: assert result.dtypes == [ pl.Int64, - pl.List(pl.Struct({"some_text_here": pl.Utf8, "list_": pl.List(pl.Int64)})), + pl.List(pl.Struct({"some_text_here": pl.String, "list_": pl.List(pl.Int64)})), ] expected = { "id": [1, 2], @@ -1249,7 +1249,7 @@ def test_from_records_nullable_structs() -> None: "items", pl.List( pl.Struct( - [pl.Field("item_id", pl.UInt32), pl.Field("description", pl.Utf8)] + [pl.Field("item_id", pl.UInt32), pl.Field("description", pl.String)] ) ), ), @@ -1311,7 +1311,7 @@ def test_nested_schema_construction() -> None: pl.List( pl.Struct( [ - pl.Field("name", pl.Utf8), + pl.Field("name", pl.String), pl.Field( "sub_nodes", pl.List( @@ -1363,7 +1363,10 @@ def test_nested_schema_construction2() -> None: "nodes", pl.List( pl.Struct( - [pl.Field("name", pl.Utf8), pl.Field("time", pl.UInt32)] + [ + pl.Field("name", pl.String), + pl.Field("time", pl.UInt32), + ] ) ), ) diff --git a/py-polars/tests/unit/test_datatypes.py b/py-polars/tests/unit/test_datatypes.py index 39b05e50c389..59bf304a2bb1 100644 --- a/py-polars/tests/unit/test_datatypes.py +++ b/py-polars/tests/unit/test_datatypes.py @@ -26,7 +26,7 @@ | pl.FLOAT_DTYPES | { pl.Boolean, - pl.Utf8, + pl.String, pl.Binary, pl.Time, pl.Date, @@ -138,8 +138,8 @@ def test_dtypes_hashable() -> None: (pl.List(pl.Duration(time_unit="ns")), "List(Duration(time_unit='ns'))"), (pl.Struct, "Struct"), ( - pl.Struct({"name": pl.Utf8, "ids": pl.List(pl.UInt32)}), - "Struct({'name': Utf8, 'ids': List(UInt32)})", + pl.Struct({"name": pl.String, "ids": pl.List(pl.UInt32)}), + "Struct({'name': String, 'ids': List(UInt32)})", ), ], ) diff --git 
a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index 184839f4344f..aab1670e6c56 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -5,11 +5,11 @@ def test_empty_str_concat_lit() -> None: - df = pl.DataFrame({"a": [], "b": []}, schema=[("a", pl.Utf8), ("b", pl.Utf8)]) + df = pl.DataFrame({"a": [], "b": []}, schema=[("a", pl.String), ("b", pl.String)]) assert df.with_columns(pl.lit("asd") + pl.col("a")).schema == { - "a": pl.Utf8, - "b": pl.Utf8, - "literal": pl.Utf8, + "a": pl.String, + "b": pl.String, + "literal": pl.String, } @@ -27,7 +27,7 @@ def test_empty_cross_join() -> None: def test_empty_string_replace() -> None: - s = pl.Series("", [], dtype=pl.Utf8) + s = pl.Series("", [], dtype=pl.String) assert_series_equal(s.str.replace("a", "b", literal=True), s) assert_series_equal(s.str.replace("a", "b"), s) assert_series_equal(s.str.replace("ab", "b", literal=True), s) @@ -37,10 +37,10 @@ def test_empty_string_replace() -> None: def test_empty_window_function() -> None: expr = (pl.col("VAL") / pl.col("VAL").sum()).over("KEY") - df = pl.DataFrame(schema={"KEY": pl.Utf8, "VAL": pl.Float64}) + df = pl.DataFrame(schema={"KEY": pl.String, "VAL": pl.Float64}) df.select(expr) # ComputeError - lf = pl.DataFrame(schema={"KEY": pl.Utf8, "VAL": pl.Float64}).lazy() + lf = pl.DataFrame(schema={"KEY": pl.String, "VAL": pl.Float64}).lazy() expected = pl.DataFrame(schema={"VAL": pl.Float64}) assert_frame_equal(lf.select(expr).collect(), expected) @@ -48,7 +48,7 @@ def test_empty_window_function() -> None: def test_empty_count_window() -> None: df = pl.DataFrame( {"ID": [], "DESC": [], "dataset": []}, - schema={"ID": pl.Utf8, "DESC": pl.Utf8, "dataset": pl.Utf8}, + schema={"ID": pl.String, "DESC": pl.String, "dataset": pl.String}, ) out = df.select(pl.col("ID").count().over(["ID", "DESC"])) @@ -75,7 +75,7 @@ def test_empty_9137() -> None: assert out.dtypes == [pl.Float32, pl.Float32] 
-@pytest.mark.parametrize("dtype", [pl.Utf8, pl.Binary, pl.UInt32]) +@pytest.mark.parametrize("dtype", [pl.String, pl.Binary, pl.UInt32]) @pytest.mark.parametrize( "set_operation", ["set_intersection", "set_union", "set_difference", "set_symmetric_difference"], @@ -120,7 +120,7 @@ def test_empty_set_symteric_difference() -> None: @pytest.mark.parametrize("name", ["sort", "unique", "head", "tail", "shift", "reverse"]) def test_empty_list_namespace_output_9585(name: str) -> None: - dtype = pl.List(pl.Utf8) + dtype = pl.List(pl.String) df = pl.DataFrame([[None]], schema={"A": dtype}) expr = getattr(pl.col("A").list, name)() diff --git a/py-polars/tests/unit/test_expr_multi_cols.py b/py-polars/tests/unit/test_expr_multi_cols.py index 2d7dc9383557..9cd259122940 100644 --- a/py-polars/tests/unit/test_expr_multi_cols.py +++ b/py-polars/tests/unit/test_expr_multi_cols.py @@ -6,7 +6,7 @@ def test_exclude_name_from_dtypes() -> None: df = pl.DataFrame({"a": ["a"], "b": ["b"]}) assert_frame_equal( - df.with_columns(pl.col(pl.Utf8).exclude("a").name.suffix("_foo")), + df.with_columns(pl.col(pl.String).exclude("a").name.suffix("_foo")), pl.DataFrame({"a": ["a"], "b": ["b"], "b_foo": ["b"]}), ) diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index e7643ce4598c..97a82203ee4c 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -401,7 +401,7 @@ def test_rank_string_null_11252() -> None: def test_unique_empty() -> None: - for dt in [pl.Utf8, pl.Boolean, pl.Int32, pl.UInt32]: + for dt in [pl.String, pl.Boolean, pl.Int32, pl.UInt32]: s = pl.Series([], dtype=dt) assert_series_equal(s.unique(), s) @@ -592,12 +592,12 @@ def test_lit_dtype_utc() -> None: (("a", "b"), ["c"]), ((["a", "b"],), ["c"]), ((pl.Int64,), ["c"]), - ((pl.Utf8, pl.Float32), ["a", "b"]), - (([pl.Utf8, pl.Float32],), ["a", "b"]), + ((pl.String, pl.Float32), ["a", "b"]), + (([pl.String, pl.Float32],), ["a", "b"]), ], ) def test_exclude(input: 
tuple[Any, ...], expected: list[str]) -> None: - df = pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.Utf8}) + df = pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}) assert df.select(pl.all().exclude(*input)).columns == expected @@ -722,7 +722,7 @@ def test_tail() -> None: (4, pl.UInt32), (4.5, pl.Float32), (None, pl.Float64), - ("白鵬翔", pl.Utf8), + ("白鵬翔", pl.String), (date.today(), pl.Date), (datetime.now(), pl.Datetime("ns")), (time(23, 59, 59), pl.Time), @@ -746,7 +746,7 @@ def test_extend_constant(const: Any, dtype: pl.PolarsDataType) -> None: (4, pl.UInt32), (4.5, pl.Float32), (None, pl.Float64), - ("白鵬翔", pl.Utf8), + ("白鵬翔", pl.String), (date.today(), pl.Date), (datetime.now(), pl.Datetime("ns")), (time(23, 59, 59), pl.Time), @@ -789,7 +789,7 @@ def test_repr_short_expression() -> None: def test_repr_long_expression() -> None: - expr = pl.functions.col(pl.Utf8).str.count_matches("") + expr = pl.functions.col(pl.String).str.count_matches("") # we cut off the last ten characters because that includes the # memory location which will vary between runs diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 323d601b5dbb..a6a26e8616f9 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -47,7 +47,7 @@ def test_lazy() -> None: [ ({}, "0 cols, {}"), ({"a": [1]}, '1 col, {"a": Int64}'), - ({"a": [1], "b": ["B"]}, '2 cols, {"a": Int64, "b": Utf8}'), + ({"a": [1], "b": ["B"]}, '2 cols, {"a": Int64, "b": String}'), ({"a": [1], "b": ["B"], "c": [0.0]}, '3 cols, {"a": Int64 … "c": Float64}'), ], ) @@ -590,11 +590,11 @@ def test_cast_frame() -> None: # cast via col:dtype map assert lf.cast( - dtypes={"b": pl.Float32, "c": pl.Utf8, "d": pl.Datetime("ms")} + dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")} ).schema == { "a": pl.Float64, "b": pl.Float32, - "c": pl.Utf8, + "c": pl.String, "d": pl.Datetime("ms"), } @@ -603,10 +603,10 @@ def test_cast_frame() -> 
None: { cs.float(): pl.UInt8, cs.integer(): pl.Int32, - cs.temporal(): pl.Utf8, + cs.temporal(): pl.String, } ) - assert lfc.schema == {"a": pl.UInt8, "b": pl.Int32, "c": pl.Boolean, "d": pl.Utf8} + assert lfc.schema == {"a": pl.UInt8, "b": pl.Int32, "c": pl.Boolean, "d": pl.String} assert lfc.collect().rows() == [ (1, 4, True, "2020-01-02"), (2, 5, False, "2021-03-04"), @@ -614,7 +614,7 @@ def test_cast_frame() -> None: ] # cast all fields to a single type - result = lf.cast(pl.Utf8) + result = lf.cast(pl.String) expected = pl.LazyFrame( { "a": ["1.0", "2.5", "3.0"], @@ -1169,7 +1169,7 @@ def test_lazy_schema() -> None: "ham": ["a", "b", "c"], } ) - assert ldf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.Utf8} + assert ldf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String} ldf = pl.LazyFrame( { @@ -1178,7 +1178,7 @@ def test_lazy_schema() -> None: "ham": ["a", "b", "c"], } ) - assert ldf.dtypes == [pl.Int64, pl.Float64, pl.Utf8] + assert ldf.dtypes == [pl.Int64, pl.Float64, pl.String] ldfe = ldf.clear() assert ldfe.schema == ldf.schema @@ -1366,8 +1366,8 @@ def test_from_epoch(input_dtype: pl.PolarsDataType) -> None: def test_from_epoch_str() -> None: ldf = pl.LazyFrame( [ - pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(pl.Utf8), - pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(pl.Utf8), + pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(pl.String), + pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(pl.String), ] ) diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index f59a90fcf8a3..2293eaee1825 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -239,7 +239,7 @@ def test_invalid_filter_predicates(predicate: Any) -> None: def test_fast_path_boolean_filter_predicates() -> None: df = pl.DataFrame({"colx": ["aa", "bb", "cc", "dd"]}) - assert_frame_equal(df.filter(False), pl.DataFrame(schema={"colx": pl.Utf8})) + 
assert_frame_equal(df.filter(False), pl.DataFrame(schema={"colx": pl.String})) assert_frame_equal(df.filter(True), df) diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index e5a3df76bebb..464e471e7df6 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -309,9 +309,9 @@ def test_projection_join_names_9955() -> None: q = q.select(batting.columns) assert q.collect().schema == { - "playerID": pl.Utf8, + "playerID": pl.String, "yearID": pl.Int64, - "lgID": pl.Utf8, + "lgID": pl.String, } diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index ed00a1e0d193..08edd662a7d4 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -160,7 +160,7 @@ def test_group_by_agg_equals_zero_3535() -> None: ("cc", None, 0.0), ], schema=[ - ("key", pl.Utf8), + ("key", pl.String), ("val1", pl.Int16), ("val2", pl.Float32), ], @@ -288,7 +288,7 @@ def test_edge_cast_string_duplicates_4259() -> None: "a": [99, 54612, 546121], "b": [1, 14484, 4484], } - ).with_columns(pl.all().cast(pl.Utf8)) + ).with_columns(pl.all().cast(pl.String)) mask = df.select(["a", "b"]).is_duplicated() df_filtered = df.filter(pl.lit(mask)) diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index e0121da51ed7..d164dc8c63f6 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -52,7 +52,7 @@ def test_schema_on_agg() -> None: ] ) ).schema == { - "a": pl.Utf8, + "a": pl.String, "min": pl.Int64, "max": pl.Int64, "sum": pl.Int64, @@ -213,7 +213,7 @@ def custom2( df: pl.DataFrame, ) -> pl.DataFrame: # changes schema - return df.select(pl.all().cast(pl.Utf8)) + return df.select(pl.all().cast(pl.String)) with pytest.raises( pl.ComputeError, @@ -309,7 +309,7 @@ def test_shrink_dtype() -> None: pl.Int32, pl.Int8, pl.Int16, - pl.Utf8, + pl.String, pl.Float32, pl.Boolean, pl.UInt8, 
@@ -369,7 +369,7 @@ def test_lazy_rename() -> None: def test_all_null_cast_5826() -> None: - df = pl.DataFrame(data=[pl.Series("a", [None], dtype=pl.Utf8)]) + df = pl.DataFrame(data=[pl.Series("a", [None], dtype=pl.String)]) out = df.with_columns(pl.col("a").cast(pl.Boolean)) assert out.dtypes == [pl.Boolean] assert out.item() is None @@ -390,8 +390,8 @@ def test_list_eval_type_cast_11188() -> None: schema={"a": pl.List(pl.Int64)}, ) assert df.select( - pl.col("a").list.eval(pl.element().cast(pl.Utf8)).alias("a_str") - ).schema == {"a_str": pl.List(pl.Utf8)} + pl.col("a").list.eval(pl.element().cast(pl.String)).alias("a_str") + ).schema == {"a_str": pl.List(pl.String)} def test_schema_true_divide_6643() -> None: @@ -427,7 +427,7 @@ def test_from_dicts_all_cols_6716() -> None: pl.ComputeError, match="make sure that all rows have the same schema" ): pl.from_dicts(dicts, infer_schema_length=20) - assert pl.from_dicts(dicts, infer_schema_length=None).dtypes == [pl.Utf8] + assert pl.from_dicts(dicts, infer_schema_length=None).dtypes == [pl.String] def test_from_dicts_empty() -> None: @@ -508,8 +508,8 @@ def sub_col_min(column: str, min_column: str) -> pl.Expr: ( {"x": ["x"], "y": ["y"]}, pl.coalesce(pl.col("x"), pl.col("y")), - {"x": pl.Utf8}, - {"x": pl.List(pl.Utf8)}, + {"x": pl.String}, + {"x": pl.List(pl.String)}, ), ( {"x": [True]}, diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index f56e3a973381..154d71df8362 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -30,7 +30,7 @@ def df() -> pl.DataFrame: "JJK": pl.Date, "Lmn": pl.Duration, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, }, ) return df @@ -53,7 +53,7 @@ def test_selector_by_dtype(df: pl.DataFrame) -> None: "def": pl.Float32, "eee": pl.Boolean, "fgg": pl.Boolean, - "qqR": pl.Utf8, + "qqR": pl.String, } @@ -280,7 +280,7 @@ def test_selector_miscellaneous(df: pl.DataFrame) -> None: assert 
df.select(cs.categorical()).columns == [] test_schema = { - "abc": pl.Utf8, + "abc": pl.String, "mno": pl.Binary, "tuv": pl.Object, "xyz": pl.Categorical, @@ -381,8 +381,8 @@ def test_selector_repr() -> None: cs.integer() & cs.matches("z"), "(cs.integer() & cs.matches(pattern='z'))" ) assert_repr_equals( - cs.temporal() | cs.by_dtype(pl.Utf8) & cs.string(include_categorical=False), - "(cs.temporal() | (cs.by_dtype(dtypes=[Utf8]) & cs.string(include_categorical=False)))", + cs.temporal() | cs.by_dtype(pl.String) & cs.string(include_categorical=False), + "(cs.temporal() | (cs.by_dtype(dtypes=[String]) & cs.string(include_categorical=False)))", ) @@ -394,7 +394,7 @@ def test_selector_sets(df: pl.DataFrame) -> None: "JJK": pl.Date, "Lmn": pl.Duration, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, } # and @@ -419,7 +419,7 @@ def test_selector_sets(df: pl.DataFrame) -> None: "fgg": pl.Boolean, "JJK": pl.Date, "opp": pl.Datetime("ms"), - "qqR": pl.Utf8, + "qqR": pl.String, } diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index 08d7cca15f5a..7816873633b1 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -126,7 +126,7 @@ def test_pickle_udf_expression() -> None: expected = pl.DataFrame({"a": [2, 4, 6]}) assert_frame_equal(result, expected) - e = pl.col("a").map_batches(times2, return_dtype=pl.Utf8) + e = pl.col("a").map_batches(times2, return_dtype=pl.String) b = pickle.dumps(e) e = pickle.loads(b) @@ -187,7 +187,7 @@ def test_serde_categorical_series_10586() -> None: def test_serde_keep_dtype_empty_list() -> None: - s = pl.Series([{"a": None}], dtype=pl.Struct([pl.Field("a", pl.List(pl.Utf8))])) + s = pl.Series([{"a": None}], dtype=pl.Struct([pl.Field("a", pl.List(pl.String))])) assert s.dtype == pickle.loads(pickle.dumps(s)).dtype diff --git a/py-polars/tests/unit/testing/test_assert_frame_equal.py b/py-polars/tests/unit/testing/test_assert_frame_equal.py index 
d0c6fda61e2f..a5d00abc6eb9 100644 --- a/py-polars/tests/unit/testing/test_assert_frame_equal.py +++ b/py-polars/tests/unit/testing/test_assert_frame_equal.py @@ -58,8 +58,8 @@ id="equal_int", ), pytest.param( - pl.DataFrame({"a": ["a", "b", "c"]}, schema={"a": pl.Utf8}), - pl.DataFrame({"a": ["a", "b", "c"]}, schema={"a": pl.Utf8}), + pl.DataFrame({"a": ["a", "b", "c"]}, schema={"a": pl.String}), + pl.DataFrame({"a": ["a", "b", "c"]}, schema={"a": pl.String}), {}, id="equal_str", ), diff --git a/py-polars/tests/unit/testing/test_assert_series_equal.py b/py-polars/tests/unit/testing/test_assert_series_equal.py index d2b52af16934..0def3fc26c3f 100644 --- a/py-polars/tests/unit/testing/test_assert_series_equal.py +++ b/py-polars/tests/unit/testing/test_assert_series_equal.py @@ -233,8 +233,8 @@ def test_assert_series_equal_temporal(data1: Any, data2: Any) -> None: id="equal_int", ), pytest.param( - pl.Series(["a", "b", "c"], dtype=pl.Utf8), - pl.Series(["a", "b", "c"], dtype=pl.Utf8), + pl.Series(["a", "b", "c"], dtype=pl.String), + pl.Series(["a", "b", "c"], dtype=pl.String), {}, id="equal_str", ), @@ -471,7 +471,7 @@ def test_assert_series_equal_raises_assertion_error( def test_assert_series_equal_categorical_vs_str() -> None: s1 = pl.Series(["a", "b", "a"], dtype=pl.Categorical) - s2 = pl.Series(["a", "b", "a"], dtype=pl.Utf8) + s2 = pl.Series(["a", "b", "a"], dtype=pl.String) with pytest.raises(AssertionError, match="dtype mismatch"): assert_series_equal(s1, s2, categorical_as_str=True)