Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

refactor(rust!): Rename Utf8 data type to String #13224

Merged
merged 36 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c87a501
DataType rename on Rust side
stinodego Dec 23, 2023
42258f3
Utf8Chunked
stinodego Dec 23, 2023
b197572
Utf8ChunkedBuilder
stinodego Dec 23, 2023
802a843
ListUtf8ChunkedBuilder
stinodego Dec 23, 2023
71a922b
Utf8ChunkedBuilderCow
stinodego Dec 23, 2023
ffa278f
AnyValue Utf8
stinodego Dec 23, 2023
3e95b62
DataType Utf8
stinodego Dec 23, 2023
de47a62
DataType Utf8
stinodego Dec 23, 2023
577da8f
repr
stinodego Dec 23, 2023
4ffa63b
AnyValue Utf8Owned
stinodego Dec 23, 2023
92de0c7
ChunkedArray utf8
stinodego Dec 23, 2023
93be312
as_utf8
stinodego Dec 23, 2023
020da9c
from_utf_to_enum
stinodego Dec 23, 2023
8878fc1
Serializable DataType Utf8
stinodego Dec 23, 2023
b95a236
AnyValueBuffer Utf8
stinodego Dec 23, 2023
bcb0c6b
Utf8Type
stinodego Dec 23, 2023
c4983d7
BinaryChunked to_utf8
stinodego Dec 23, 2023
3f0d015
boolean_to_utf8
stinodego Dec 23, 2023
e119c21
refs
stinodego Dec 23, 2023
df5200a
utf8 modules core
stinodego Dec 23, 2023
56b0abf
polars-time utf8 module
stinodego Dec 23, 2023
ea63ffe
polars core utf8 module
stinodego Dec 23, 2023
278bfd0
get_utf8_builder
stinodego Dec 23, 2023
43ad9d2
LargeUtf8Array
stinodego Dec 23, 2023
625a0f7
py-polars
stinodego Dec 23, 2023
c732bfa
String methods
stinodego Dec 23, 2023
6a65c0e
refs
stinodego Dec 23, 2023
75f4b6f
Refs
stinodego Dec 23, 2023
04f313b
LiteralValue Utf8
stinodego Dec 23, 2023
2ac74b9
Namespace
stinodego Dec 23, 2023
d424fd1
More refs
stinodego Dec 23, 2023
42aa9f4
Fix tests for error messages
stinodego Dec 23, 2023
5ccdd86
PyO3 Bindings
stinodego Dec 23, 2023
965e169
Accessor string to str
stinodego Dec 25, 2023
fee1d1b
Fix docs
stinodego Dec 25, 2023
e11f2b4
More updated
stinodego Dec 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ lazy = []
# more fast paths, slower compilation
performant = ["arrow/performant", "reinterpret"]

# extra utilities for Utf8Chunked
# extra utilities for StringChunked
strings = ["regex", "arrow/strings", "polars-error/regex"]
# support for ObjectChunked<T> (downcastable Series of any type)
object = ["serde_json"]
Expand Down
16 changes: 8 additions & 8 deletions crates/polars-core/src/chunked_array/arithmetic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,27 @@ fn concat_binary_arrs(l: &[u8], r: &[u8], buf: &mut Vec<u8>) {
buf.extend_from_slice(r);
}

impl Add for &Utf8Chunked {
type Output = Utf8Chunked;
impl Add for &StringChunked {
type Output = StringChunked;

fn add(self, rhs: Self) -> Self::Output {
unsafe { (self.as_binary() + rhs.as_binary()).to_utf8() }
unsafe { (self.as_binary() + rhs.as_binary()).to_string() }
}
}

impl Add for Utf8Chunked {
type Output = Utf8Chunked;
impl Add for StringChunked {
type Output = StringChunked;

fn add(self, rhs: Self) -> Self::Output {
(&self).add(&rhs)
}
}

impl Add<&str> for &Utf8Chunked {
type Output = Utf8Chunked;
impl Add<&str> for &StringChunked {
type Output = StringChunked;

fn add(self, rhs: &str) -> Self::Output {
unsafe { ((&self.as_binary()) + rhs.as_bytes()).to_utf8() }
unsafe { ((&self.as_binary()) + rhs.as_bytes()).to_string() }
}
}

Expand Down
14 changes: 7 additions & 7 deletions crates/polars-core/src/chunked_array/builder/list/binary.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
use super::*;

pub struct ListUtf8ChunkedBuilder {
pub struct ListStringChunkedBuilder {
builder: LargeListUtf8Builder,
field: Field,
fast_explode: bool,
}

impl ListUtf8ChunkedBuilder {
impl ListStringChunkedBuilder {
pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self {
let values = MutableUtf8Array::<i64>::with_capacity(values_capacity);
let builder = LargeListUtf8Builder::new_with_capacity(values, capacity);
let field = Field::new(name, DataType::List(Box::new(DataType::Utf8)));
let field = Field::new(name, DataType::List(Box::new(DataType::String)));

ListUtf8ChunkedBuilder {
ListStringChunkedBuilder {
builder,
field,
fast_explode: true,
Expand Down Expand Up @@ -47,7 +47,7 @@ impl ListUtf8ChunkedBuilder {
}

#[inline]
pub(crate) fn append(&mut self, ca: &Utf8Chunked) {
pub(crate) fn append(&mut self, ca: &StringChunked) {
if ca.is_empty() {
self.fast_explode = false;
}
Expand All @@ -57,7 +57,7 @@ impl ListUtf8ChunkedBuilder {
}
}

impl ListBuilderTrait for ListUtf8ChunkedBuilder {
impl ListBuilderTrait for ListStringChunkedBuilder {
#[inline]
fn append_null(&mut self) {
self.fast_explode = false;
Expand All @@ -69,7 +69,7 @@ impl ListBuilderTrait for ListUtf8ChunkedBuilder {
if s.is_empty() {
self.fast_explode = false;
}
let ca = s.utf8()?;
let ca = s.str()?;
self.append(ca);
Ok(())
}
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-core/src/chunked_array/builder/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,10 @@ pub fn get_list_builder(
Box::new(builder)
}};
}
macro_rules! get_utf8_builder {
macro_rules! get_string_builder {
() => {{
let builder =
ListUtf8ChunkedBuilder::new(&name, list_capacity, 5 * value_capacity);
ListStringChunkedBuilder::new(&name, list_capacity, 5 * value_capacity);
Box::new(builder)
}};
}
Expand All @@ -167,7 +167,7 @@ pub fn get_list_builder(
Ok(match_dtype_to_logical_apply_macro!(
physical_type,
get_primitive_builder,
get_utf8_builder,
get_string_builder,
get_binary_builder,
get_bool_builder
))
Expand Down
10 changes: 5 additions & 5 deletions crates/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pub mod fixed_size_list;
pub mod list;
mod null;
mod primitive;
mod utf8;
mod string;

use std::borrow::Cow;
use std::iter::FromIterator;
Expand All @@ -21,7 +21,7 @@ pub(crate) use fixed_size_list::*;
pub use list::*;
pub use null::*;
pub use primitive::*;
pub use utf8::*;
pub use string::*;

use crate::chunked_array::to_primitive;
use crate::prelude::*;
Expand Down Expand Up @@ -123,7 +123,7 @@ impl NewChunkedArray<BooleanType, bool> for BooleanChunked {
}
}

impl<S> NewChunkedArray<Utf8Type, S> for Utf8Chunked
impl<S> NewChunkedArray<StringType, S> for StringChunked
where
S: AsRef<str>,
{
Expand All @@ -148,15 +148,15 @@ where

fn from_iter_options(name: &str, it: impl Iterator<Item = Option<S>>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = Utf8ChunkedBuilder::new(name, cap, cap * 5);
let mut builder = StringChunkedBuilder::new(name, cap, cap * 5);
it.for_each(|opt| builder.append_option(opt));
builder.finish()
}

/// Create a new ChunkedArray from an iterator.
fn from_iter_values(name: &str, it: impl Iterator<Item = S>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = Utf8ChunkedBuilder::new(name, cap, cap * 5);
let mut builder = StringChunkedBuilder::new(name, cap, cap * 5);
it.for_each(|v| builder.append_value(v));
builder.finish()
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
use super::*;

#[derive(Clone)]
pub struct Utf8ChunkedBuilder {
pub struct StringChunkedBuilder {
pub(crate) builder: MutableUtf8Array<i64>,
pub capacity: usize,
pub(crate) field: Field,
}

impl Utf8ChunkedBuilder {
/// Create a new UtfChunkedBuilder
impl StringChunkedBuilder {
/// Create a new StringChunkedBuilder
///
/// # Arguments
///
/// * `capacity` - Number of string elements in the final array.
/// * `bytes_capacity` - Number of bytes needed to store the string values.
pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self {
Utf8ChunkedBuilder {
StringChunkedBuilder {
builder: MutableUtf8Array::<i64>::with_capacities(capacity, bytes_capacity),
capacity,
field: Field::new(name, DataType::Utf8),
field: Field::new(name, DataType::String),
}
}

Expand All @@ -39,7 +39,7 @@ impl Utf8ChunkedBuilder {
self.builder.push(opt);
}

pub fn finish(mut self) -> Utf8Chunked {
pub fn finish(mut self) -> StringChunked {
let arr = self.builder.as_box();

let mut ca = ChunkedArray {
Expand All @@ -59,19 +59,19 @@ impl Utf8ChunkedBuilder {
}
}

pub struct Utf8ChunkedBuilderCow {
builder: Utf8ChunkedBuilder,
pub struct StringChunkedBuilderCow {
builder: StringChunkedBuilder,
}

impl Utf8ChunkedBuilderCow {
impl StringChunkedBuilderCow {
pub fn new(name: &str, capacity: usize) -> Self {
Utf8ChunkedBuilderCow {
builder: Utf8ChunkedBuilder::new(name, capacity, capacity),
StringChunkedBuilderCow {
builder: StringChunkedBuilder::new(name, capacity, capacity),
}
}
}

impl ChunkedBuilder<Cow<'_, str>, Utf8Type> for Utf8ChunkedBuilderCow {
impl ChunkedBuilder<Cow<'_, str>, StringType> for StringChunkedBuilderCow {
#[inline]
fn append_value(&mut self, val: Cow<'_, str>) {
self.builder.append_value(val.as_ref())
Expand All @@ -82,7 +82,7 @@ impl ChunkedBuilder<Cow<'_, str>, Utf8Type> for Utf8ChunkedBuilderCow {
self.builder.append_null()
}

fn finish(self) -> ChunkedArray<Utf8Type> {
fn finish(self) -> ChunkedArray<StringType> {
self.builder.finish()
}

Expand Down
42 changes: 23 additions & 19 deletions crates/polars-core/src/chunked_array/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ where
}
}

impl ChunkCast for Utf8Chunked {
impl ChunkCast for StringChunked {
fn cast(&self, data_type: &DataType) -> PolarsResult<Series> {
match data_type {
#[cfg(feature = "dtype-categorical")]
Expand All @@ -210,12 +210,16 @@ impl ChunkCast for Utf8Chunked {
},
Some(rev_map) => {
polars_ensure!(rev_map.is_enum(), InvalidOperation: "casting to a non-enum variant with rev map is not supported for the user");
CategoricalChunked::from_utf8_to_enum(self, rev_map.get_categories(), *ordering)
.map(|ca| {
let mut s = ca.into_series();
s.rename(self.name());
s
})
CategoricalChunked::from_string_to_enum(
self,
rev_map.get_categories(),
*ordering,
)
.map(|ca| {
let mut s = ca.into_series();
s.rename(self.name());
s
})
},
},
#[cfg(feature = "dtype-struct")]
Expand Down Expand Up @@ -284,18 +288,18 @@ unsafe fn binary_to_utf8_unchecked(from: &BinaryArray<i64>) -> Utf8Array<i64> {

impl BinaryChunked {
/// # Safety
/// Utf8 is not validated
pub unsafe fn to_utf8(&self) -> Utf8Chunked {
/// String is not validated
pub unsafe fn to_string(&self) -> StringChunked {
let chunks = self
.downcast_iter()
.map(|arr| Box::new(binary_to_utf8_unchecked(arr)) as ArrayRef)
.collect();
let field = Arc::new(Field::new(self.name(), DataType::Utf8));
Utf8Chunked::from_chunks_and_metadata(chunks, field, self.bit_settings, true, true)
let field = Arc::new(Field::new(self.name(), DataType::String));
StringChunked::from_chunks_and_metadata(chunks, field, self.bit_settings, true, true)
}
}

impl Utf8Chunked {
impl StringChunked {
pub fn as_binary(&self) -> BinaryChunked {
let chunks = self
.downcast_iter()
Expand Down Expand Up @@ -324,13 +328,13 @@ impl ChunkCast for BinaryChunked {

unsafe fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult<Series> {
match data_type {
DataType::Utf8 => unsafe { Ok(self.to_utf8().into_series()) },
DataType::String => unsafe { Ok(self.to_string().into_series()) },
_ => self.cast(data_type),
}
}
}

fn boolean_to_utf8(ca: &BooleanChunked) -> Utf8Chunked {
fn boolean_to_string(ca: &BooleanChunked) -> StringChunked {
ca.into_iter()
.map(|opt_b| match opt_b {
Some(true) => Some("true"),
Expand All @@ -343,8 +347,8 @@ fn boolean_to_utf8(ca: &BooleanChunked) -> Utf8Chunked {
impl ChunkCast for BooleanChunked {
fn cast(&self, data_type: &DataType) -> PolarsResult<Series> {
match data_type {
DataType::Utf8 => {
let mut ca = boolean_to_utf8(self);
DataType::String => {
let mut ca = boolean_to_string(self);
ca.rename(self.name());
Ok(ca.into_series())
},
Expand All @@ -369,7 +373,7 @@ impl ChunkCast for ListChunked {
match (self.inner_dtype(), &**child_type) {
#[cfg(feature = "dtype-categorical")]
(dt, Categorical(None, _))
if !matches!(dt, Categorical(_, _) | Utf8 | Null) =>
if !matches!(dt, Categorical(_, _) | String | Null) =>
{
polars_bail!(ComputeError: "cannot cast List inner type: '{:?}' to Categorical", dt)
},
Expand Down Expand Up @@ -423,7 +427,7 @@ impl ChunkCast for ArrayChunked {
Array(child_type, width) => {
match (self.inner_dtype(), &**child_type) {
#[cfg(feature = "dtype-categorical")]
(dt, Categorical(None, _)) if !matches!(dt, Utf8) => {
(dt, Categorical(None, _)) if !matches!(dt, String) => {
polars_bail!(ComputeError: "cannot cast fixed-size-list inner type: '{:?}' to Categorical", dt)
},
_ => {
Expand Down Expand Up @@ -557,7 +561,7 @@ mod test {
#[cfg(feature = "dtype-categorical")]
fn test_cast_noop() {
// check if we can cast categorical twice without panic
let ca = Utf8Chunked::new("foo", &["bar", "ham"]);
let ca = StringChunked::new("foo", &["bar", "ham"]);
let out = ca
.cast(&DataType::Categorical(None, Default::default()))
.unwrap();
Expand Down
Loading