Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Python factories and remove usage of Table for libcudf output handling #8687

Merged
merged 56 commits on Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit. Hold shift + click to select a range.
922bf03
Redefine DataFrame._from_table in terms of DataFrame._from_data.
vyasr Jun 25, 2021
f85be21
Redefine Series._from_table in terms of Series._from_data.
vyasr Jun 26, 2021
025ffa1
Add new API for getting columns from a unique ptr.
vyasr Jul 1, 2021
d68fa95
Use new API to return list of columns from libcudf, write new code to…
vyasr Jul 2, 2021
6b47727
Move conversion function to utils.
vyasr Jul 2, 2021
df49647
Combine utility functions into one.
vyasr Jul 6, 2021
0439935
Convert explode to new API.
vyasr Jul 6, 2021
a9a52b6
Convert gather to new API.
vyasr Jul 6, 2021
c684d17
Convert hash to new API.
vyasr Jul 6, 2021
c3add5b
Convert scatter to new API.
vyasr Jul 6, 2021
529385b
Convert empty_like and partition.
vyasr Jul 6, 2021
27b99ad
Convert more APIs.
vyasr Jul 6, 2021
abe12fd
Remove remaining from_unique_ptr calls that pass up to frame.py.
vyasr Jul 6, 2021
012d6fe
Change read_parquet.
vyasr Jul 6, 2021
98a4623
Change string partitioning (but still wrapped in a Frame in Python fo…
vyasr Jul 6, 2021
9f36fa4
Change string splitting (but still wrapped in a Frame in Python for t…
vyasr Jul 6, 2021
71e5dae
Change string extract and findall.
vyasr Jul 6, 2021
da00ee1
Change merge_sorted.
vyasr Jul 6, 2021
48dbdf4
Update interop.
vyasr Jul 6, 2021
0d8c719
Update avro and json.
vyasr Jul 6, 2021
b47edb2
Switch Groupby.groups.
vyasr Jul 6, 2021
986516a
Update orc.
vyasr Jul 7, 2021
ae2e731
Update groupby shift.
vyasr Jul 7, 2021
ffe520c
Update groupby replace_nulls.
vyasr Jul 7, 2021
5c6f254
Update groupby aggregate.
vyasr Jul 7, 2021
8af8ec9
Update csv.
vyasr Jul 7, 2021
c4f7cb5
Remove from_unique_ptr.
vyasr Jul 7, 2021
9e60845
Reduce usage of _from_table in frame.py.
vyasr Jul 7, 2021
321071d
Remove unnecessary test.
vyasr Jul 7, 2021
9e1ab29
Add new function for data from table view.
vyasr Jul 7, 2021
e0a39c5
Remove from_table_view from copying.pyx and fix calls.
vyasr Jul 7, 2021
ffb74f3
Remove usage of Table.from_table_view.
vyasr Jul 7, 2021
c8dac36
Remove from_table_view.
vyasr Jul 7, 2021
faac839
Rewrite _from_table in terms of _from_data for Index types.
vyasr Jul 7, 2021
72eb1d9
Remove most uses of _from_table.
vyasr Jul 7, 2021
74025bc
Inline ColumnBase.scatter_to_column in the only place it is used (whi…
vyasr Jul 7, 2021
45748ad
Remove last usage of _from_table outside of tests.
vyasr Jul 7, 2021
567c2fc
Remove use of _from_table from test_pack.
vyasr Jul 7, 2021
401b47c
Remove _from_table.
vyasr Jul 7, 2021
d6b8daf
Standardize from_data implementations further.
vyasr Jul 8, 2021
ab65265
Remove unnecessary ColumnAccessor conversions in Python code.
vyasr Jul 8, 2021
57c044e
Remove unnecessary ColumnAccessor conversions in Cython code.
vyasr Jul 8, 2021
ab91dd4
Remove now outdated TODO.
vyasr Jul 8, 2021
5dbb3c7
Fix style issues.
vyasr Jul 8, 2021
ed77ce9
Try moving ignore to see if that makes mypy happy.
vyasr Jul 8, 2021
15d7337
Simplify groupby APIs by constructing Indexes at the call site.
vyasr Jul 8, 2021
b45965d
Fix categorical index metadata copying.
vyasr Jul 8, 2021
2a58ebe
Merge remote-tracking branch 'origin/branch-21.08' into refactor/fact…
vyasr Jul 15, 2021
1707199
Merge remote-tracking branch 'origin/branch-21.10' into refactor/fact…
vyasr Jul 15, 2021
885f74a
Merge remote-tracking branch 'origin/branch-21.10' into refactor/fact…
vyasr Jul 20, 2021
1bcc059
Merge remote-tracking branch 'origin/branch-21.10' into refactor/fact…
vyasr Jul 26, 2021
84f2e35
Fix bug in generating a table from a scalar.
vyasr Jul 26, 2021
7d50fa4
Make TODO more informative.
vyasr Jul 26, 2021
944162d
Remove TODOs.
vyasr Jul 26, 2021
8cb5845
Improve docstring and clean up TODOs.
vyasr Jul 26, 2021
a2fd254
Update python/cudf/cudf/_lib/utils.pyx
vyasr Jul 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/avro.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ from cudf._lib.cpp.io.types cimport table_with_metadata
from cudf._lib.cpp.types cimport size_type
from cudf._lib.io.utils cimport make_source_info
from cudf._lib.table cimport Table
from cudf._lib.utils cimport data_from_unique_ptr


cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
Expand Down Expand Up @@ -52,4 +53,4 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):

names = [name.decode() for name in c_result.metadata.column_names]

return Table.from_unique_ptr(move(c_result.tbl), column_names=names)
return data_from_unique_ptr(move(c_result.tbl), column_names=names)
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/concat.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from cudf._lib.cpp.concatenate cimport (
from cudf._lib.cpp.table.table cimport table, table_view
from cudf._lib.table cimport Table
from cudf._lib.utils cimport (
data_from_unique_ptr,
make_column_views,
make_table_data_views,
make_table_views,
Expand Down Expand Up @@ -52,7 +53,8 @@ cpdef concat_tables(object tables, bool ignore_index=False):
c_views = make_table_data_views(tables)
with nogil:
c_result = move(libcudf_concatenate_tables(c_views))
return Table.from_unique_ptr(

return data_from_unique_ptr(
move(c_result),
column_names=tables[0]._column_names,
index_names=None if ignore_index else tables[0]._index_names
Expand Down
44 changes: 20 additions & 24 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

from rmm._lib.device_buffer cimport DeviceBuffer

from cudf.core.buffer import Buffer

from cudf._lib.column cimport Column
Expand All @@ -35,6 +36,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr

# workaround for https://github.com/cython/cython/issues/3885
ctypedef const scalar constscalar
Expand Down Expand Up @@ -178,7 +180,7 @@ def gather(
)
)

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
Expand Down Expand Up @@ -210,19 +212,17 @@ def _scatter_table(Table source_table, Column scatter_map,
)
)

out_table = Table.from_unique_ptr(
data, _ = data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=None
)

out_table._index = (
return data, (
None if target_table._index is None else target_table._index.copy(
deep=False)
)

return out_table


def _scatter_scalar(scalars, Column scatter_map,
Table target_table, bool bounds_check=True):
Expand Down Expand Up @@ -250,19 +250,17 @@ def _scatter_scalar(scalars, Column scatter_map,
)
)

out_table = Table.from_unique_ptr(
data, _ = data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=None
)

out_table._index = (
return data, (
None if target_table._index is None else target_table._index.copy(
deep=False)
)

return out_table


def scatter(object input, object scatter_map, Table target,
bool bounds_check=True):
Expand Down Expand Up @@ -306,7 +304,7 @@ def _reverse_table(Table source_table):
reverse_table_view
))

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=source_table._index_names
Expand Down Expand Up @@ -371,7 +369,7 @@ def table_empty_like(Table input_table, bool keep_index=True):
with nogil:
c_result = move(cpp_copying.empty_like(input_table_view))

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=input_table._column_names,
index_names=(
Expand Down Expand Up @@ -434,8 +432,8 @@ def table_slice(Table input_table, object indices, bool keep_index=True):
)

num_of_result_cols = c_result.size()
result =[
Table.from_table_view(
return [
data_from_table_view(
c_result[i],
input_table,
column_names=input_table._column_names,
Expand All @@ -446,8 +444,6 @@ def table_slice(Table input_table, object indices, bool keep_index=True):
)
) for i in range(num_of_result_cols)]

return result


def column_split(Column input_column, object splits):

Expand Down Expand Up @@ -505,8 +501,8 @@ def table_split(Table input_table, object splits, bool keep_index=True):
)

num_of_result_cols = c_result.size()
result = [
Table.from_table_view(
return [
data_from_table_view(
c_result[i],
input_table,
column_names=input_table._column_names,
Expand All @@ -515,8 +511,6 @@ def table_split(Table input_table, object splits, bool keep_index=True):
else None
) for i in range(num_of_result_cols)]

return result


def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask):

Expand Down Expand Up @@ -642,7 +636,7 @@ def _boolean_mask_scatter_table(Table input_table, Table target_table,
)
)

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
Expand Down Expand Up @@ -672,13 +666,15 @@ def _boolean_mask_scatter_scalar(list input_scalars, Table target_table,
)
)

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
)


# TODO: This function is currently unused but should be used in
# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667.
def boolean_mask_scatter(object input, Table target_table,
Column boolean_mask):

Expand Down Expand Up @@ -755,7 +751,7 @@ def sample(Table input, size_type n,
cpp_copying.sample(tbl_view, n, replacement, seed)
)

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_output),
column_names=input._column_names,
index_names=(
Expand Down Expand Up @@ -884,12 +880,12 @@ cdef class _CPackedColumns:
return p

def unpack(self):
output_table = Table.from_table_view(
output_table = Table(*data_from_table_view(
cpp_copying.unpack(self.c_obj),
self,
self.column_names,
self.index_names
)
))

for name, dtype in self.column_dtypes.items():
output_table._data[name] = (
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ from cudf._lib.cpp.io.types cimport (
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.io.utils cimport make_sink_info, make_source_info
from cudf._lib.table cimport Table, make_table_view
from cudf._lib.utils cimport data_from_unique_ptr

ctypedef int32_t underlying_type_t_compression

Expand Down Expand Up @@ -393,7 +394,7 @@ def read_csv(
c_result = move(cpp_read_csv(read_csv_options_c))

meta_names = [name.decode() for name in c_result.metadata.column_names]
df = cudf.DataFrame._from_table(Table.from_unique_ptr(
df = cudf.DataFrame._from_data(*data_from_unique_ptr(
move(c_result.tbl),
column_names=meta_names
))
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/filling.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.table cimport Table
from cudf._lib.utils cimport data_from_unique_ptr


def fill_in_place(Column destination, int begin, int end, DeviceScalar value):
Expand Down Expand Up @@ -70,7 +71,7 @@ def _repeat_via_column(Table inp, Column count, bool check_count):
c_check_count
))

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=inp._column_names,
index_names=inp._index_names
Expand All @@ -87,7 +88,7 @@ def _repeat_via_size_type(Table inp, size_type count):
count
))

return Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=inp._column_names,
index_names=inp._index_names
Expand Down
27 changes: 14 additions & 13 deletions python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ from libcpp.pair cimport pair
from libcpp.utility cimport move
from libcpp.vector cimport vector

import cudf

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.table cimport Table
Expand All @@ -39,6 +41,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table, table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.utilities.host_span cimport host_span
from cudf._lib.utils cimport data_from_unique_ptr

# The sets below define the possible aggregations that can be performed on
# different dtypes. These strings must be elements of the AggregationKind enum.
Expand Down Expand Up @@ -91,11 +94,11 @@ cdef class GroupBy:
c_grouped_values = move(c_groups.values)
c_group_offsets = c_groups.offsets

grouped_keys = Table.from_unique_ptr(
grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
move(c_grouped_keys),
column_names=range(c_grouped_keys.get()[0].num_columns())
)
grouped_values = Table.from_unique_ptr(
))
grouped_values = data_from_unique_ptr(
move(c_grouped_values),
index_names=values._index_names,
column_names=values._column_names
Expand Down Expand Up @@ -197,7 +200,7 @@ cdef class GroupBy:
else:
raise

grouped_keys = Table.from_unique_ptr(
grouped_keys, _ = data_from_unique_ptr(
move(c_result.first),
column_names=self.keys._column_names
)
Expand All @@ -213,7 +216,7 @@ cdef class GroupBy:
Column.from_unique_ptr(move(c_result.second[i].results[j]))
)

return Table(data=result_data, index=grouped_keys)
return result_data, cudf.Index._from_data(grouped_keys)

def shift(self, Table values, int periods, list fill_values):
cdef table_view view = values.view()
Expand All @@ -238,16 +241,16 @@ cdef class GroupBy:
self.c_obj.get()[0].shift(view, offsets, c_fill_values)
)

grouped_keys = Table.from_unique_ptr(
grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
move(c_result.first),
column_names=self.keys._column_names
)
))

shifted = Table.from_unique_ptr(
shifted, _ = data_from_unique_ptr(
move(c_result.second), column_names=values._column_names
)

return Table(data=shifted._data, index=grouped_keys)
return shifted, grouped_keys

def replace_nulls(self, Table values, object method):
cdef table_view val_view = values.view()
Expand All @@ -265,12 +268,10 @@ cdef class GroupBy:
self.c_obj.get()[0].replace_nulls(val_view, policies)
)

grouped_result = Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result.second), column_names=values._column_names
)
)[0]

result = Table(data=grouped_result._data)
return result

_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"}

Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.table cimport Table
from cudf._lib.utils cimport data_from_unique_ptr


def hash_partition(Table source_table, object columns_to_hash,
Expand All @@ -41,12 +42,14 @@ def hash_partition(Table source_table, object columns_to_hash,
# the original table (`source_table`) is empty. We need to
# return a list of zeros in this case.
return (
Table.from_unique_ptr(
*data_from_unique_ptr(
move(c_result.first),
column_names=source_table._column_names,
index_names=source_table._index_names if(
keep_index is True)
else None
index_names=(
source_table._index_names
if keep_index is True
else None
)

),
list(c_result.second) if c_result.second.size()
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ from cudf._lib.cpp.interop cimport (
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.table cimport Table
from cudf._lib.utils cimport data_from_unique_ptr


def from_dlpack(dlpack_capsule):
Expand All @@ -40,7 +41,7 @@ def from_dlpack(dlpack_capsule):
cpp_from_dlpack(dlpack_tensor)
)

res = Table.from_unique_ptr(
res = data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
)
Expand Down Expand Up @@ -164,10 +165,8 @@ def from_arrow(
with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

out_table = Table.from_unique_ptr(
return data_from_unique_ptr(
move(c_result),
column_names=column_names,
index_names=index_names
)

return out_table
Loading