-
Notifications
You must be signed in to change notification settings - Fork 871
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Avoid DataFrame conversion in MultiIndex.from_pandas
#14470
Changes from 4 commits
64faed1
8fee3a8
5bc08b5
2a69ebe
c50f3c6
ee528c8
ea2e5e7
01f51f9
a22c904
1ef937e
e02edc3
3188454
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,6 @@ | |
from pandas._config import get_option | ||
|
||
import cudf | ||
from cudf import _lib as libcudf | ||
from cudf._typing import DataFrameOrSeries | ||
from cudf.api.extensions import no_default | ||
from cudf.api.types import is_integer, is_list_like, is_object_dtype | ||
|
@@ -141,7 +140,9 @@ def __init__( | |
if copy: | ||
if isinstance(codes, cudf.DataFrame): | ||
codes = codes.copy(deep=True) | ||
if len(levels) > 0 and isinstance(levels[0], cudf.Series): | ||
if len(levels) > 0 and isinstance( | ||
levels[0], (cudf.Index, cudf.Series) | ||
): | ||
levels = [level.copy(deep=True) for level in levels] | ||
|
||
if not isinstance(codes, cudf.DataFrame): | ||
|
@@ -158,7 +159,7 @@ def __init__( | |
"codes and is inconsistent!" | ||
) | ||
|
||
levels = [cudf.Series(level) for level in levels] | ||
levels = [cudf.Index(level) for level in levels] | ||
|
||
if len(levels) != len(codes._data): | ||
raise ValueError( | ||
|
@@ -178,18 +179,12 @@ def __init__( | |
) | ||
|
||
source_data = {} | ||
for i, (column_name, col) in enumerate(codes._data.items()): | ||
if -1 in col: | ||
level = cudf.DataFrame( | ||
{column_name: [None] + list(levels[i])}, | ||
index=range(-1, len(levels[i])), | ||
) | ||
else: | ||
level = cudf.DataFrame({column_name: levels[i]}) | ||
|
||
source_data[column_name] = libcudf.copying.gather( | ||
[level._data[column_name]], col | ||
)[0] | ||
for (column_name, col), level in zip(codes._data.items(), levels): | ||
result_col = level._column.take(col) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On line 175 we already do partial bounds-checking. The treatment of But let us move all of the validation into one place:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice find. Was able to consolidate the codes validate in this loop |
||
mask = col == -1 | ||
if mask.any(): | ||
result_col[mask] = cudf.NA | ||
source_data[column_name] = result_col | ||
|
||
super().__init__(source_data) | ||
self._levels = levels | ||
|
@@ -481,7 +476,8 @@ def copy( | |
|
||
mi = MultiIndex._from_data(self._data.copy(deep=deep)) | ||
if self._levels is not None: | ||
mi._levels = [s.copy(deep) for s in self._levels] | ||
# TODO: why is deep=True not being respected here | ||
mi._levels = [idx.copy(deep) for idx in self._levels] | ||
wence- marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self._codes is not None: | ||
mi._codes = self._codes.copy(deep) | ||
if names is not None: | ||
|
@@ -1625,19 +1621,13 @@ def from_pandas(cls, multiindex, nan_as_null=no_default): | |
nan_as_null = ( | ||
False if cudf.get_option("mode.pandas_compatible") else None | ||
) | ||
|
||
# if `multiindex` has two or more levels that | ||
# have the same name, then `multiindex.to_frame()` | ||
# results in a DataFrame containing only one of those | ||
# levels. Thus, set `names` to some tuple of unique values | ||
# and then call `multiindex.to_frame(name=names)`, | ||
# which preserves all levels of `multiindex`. | ||
names = tuple(range(len(multiindex.names))) | ||
|
||
df = cudf.DataFrame.from_pandas( | ||
multiindex.to_frame(index=False, name=names), nan_as_null | ||
levels = [ | ||
cudf.Series(level, nan_as_null=nan_as_null) | ||
for level in multiindex.levels | ||
] | ||
return cls( | ||
levels=levels, codes=multiindex.codes, names=multiindex.names | ||
) | ||
return cls.from_frame(df, names=multiindex.names) | ||
|
||
@cached_property # type: ignore | ||
@_cudf_nvtx_annotate | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, I am not sure that
codes
can benp.int64
it should probably be size_type_dtype. But let us not fix that here.