Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ColumnMethods and its subclasses to remove column argument and require parent argument #8306

Merged
merged 18 commits into from
Jul 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
from cudf._lib.nvtext.generate_ngrams import (
generate_character_ngrams,
generate_ngrams,
)
from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
from cudf._lib.nvtext.stemmer import (
LetterType,
is_letter,
is_letter_multi,
porter_stemmer_measure,
)
from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file
from cudf._lib.nvtext.tokenize import (
_count_tokens_column,
_count_tokens_scalar,
_tokenize_column,
_tokenize_scalar,
character_tokenize,
detokenize,
)
from cudf._lib.strings.attributes import (
code_points,
count_bytes,
count_characters,
)
from cudf._lib.strings.capitalize import capitalize, title
from cudf._lib.strings.case import swapcase, to_lower, to_upper
from cudf._lib.strings.char_types import (
filter_alphanum,
is_alnum,
is_alpha,
is_decimal,
is_digit,
is_lower,
is_numeric,
is_space,
is_upper,
)
from cudf._lib.strings.combine import (
concatenate,
join,
join_lists_with_column,
join_lists_with_scalar,
)
from cudf._lib.strings.contains import contains_re, count_re, match_re
from cudf._lib.strings.convert.convert_fixed_point import to_decimal
from cudf._lib.strings.convert.convert_floats import is_float
from cudf._lib.strings.convert.convert_integers import is_integer
from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
from cudf._lib.strings.extract import extract
from cudf._lib.strings.find import (
contains,
contains_multiple,
endswith,
endswith_multiple,
find,
rfind,
startswith,
startswith_multiple,
)
from cudf._lib.strings.findall import findall
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
from cudf._lib.strings.replace import (
insert,
replace,
replace_multi,
slice_replace,
)
from cudf._lib.strings.replace_re import (
replace_multi_re,
replace_re,
replace_with_backrefs,
)
from cudf._lib.strings.split.partition import partition, rpartition
from cudf._lib.strings.split.split import (
rsplit,
rsplit_record,
split,
split_record,
)
from cudf._lib.strings.strip import lstrip, rstrip, strip
from cudf._lib.strings.substring import get, slice_from, slice_strings
from cudf._lib.strings.translate import filter_characters, translate
from cudf._lib.strings.wrap import wrap
24 changes: 12 additions & 12 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ from cudf._lib.cpp.strings.combine cimport (


def concatenate(Table source_strings,
object py_separator,
object py_narep):
object sep,
object na_rep):
"""
Returns a Column by concatenating strings column-wise in `source_strings`
with the specified `py_separator` between each column and
`na`/`None` values are replaced by `py_narep`
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""
cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = source_strings.data_view()
Expand All @@ -53,16 +53,16 @@ def concatenate(Table source_strings,


def join(Column source_strings,
object py_separator,
object py_narep):
object sep,
object na_rep):
"""
Returns a Column by concatenating strings row-wise in `source_strings`
with the specified `py_separator` between each column and
`na`/`None` values are replaced by `py_narep`
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/_lib/transpose.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ def transpose(Table source):
if is_categorical_dtype(dtype):
if any(not is_categorical_dtype(c.dtype) for c in source._columns):
raise ValueError('Columns must all have the same dtype')
cats = list(c.cat().categories for c in source._columns)
cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column
cats = list(c.categories for c in source._columns)
cats = cudf.core.column.concat_columns(cats).unique()
source = Table(index=source._index, data=[
(name, col.cat()._set_categories(
col.cat().categories, cats, is_unique=True).codes)
(name, col._set_categories(cats, is_unique=True).codes)
for name, col in source._data.items()
])
elif dtype.kind in 'OU':
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@
BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"]

DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"]
2 changes: 1 addition & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def _union_categoricals(
sorted_categories = result_col.categories.sort_by_values(
ascending=True
)[0]
result_col = result_col.cat().reorder_categories(
result_col = result_col.reorder_categories(
new_categories=sorted_categories
)

Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
"""
isort: skip_file
"""
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved


from cudf.core.column.categorical import CategoricalColumn
from cudf.core.column.column import (
Expand All @@ -12,6 +16,7 @@
column_empty,
column_empty_like,
column_empty_like_same_mask,
concat_columns,
deserialize_columns,
full,
serialize_columns,
Expand All @@ -27,3 +32,4 @@
Decimal32Column,
Decimal64Column,
)
from cudf.core.column.interval import IntervalColumn # noqa: F401
Loading