Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disallow constructing frames from a ColumnAccessor #7298

Merged
merged 39 commits into from
Feb 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4a4b4af
Merge branch 'branch-0.17' into branch-0.18
shwina Dec 11, 2020
223f2b5
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Dec 15, 2020
abd6ad2
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Dec 17, 2020
18863b5
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 4, 2021
0fbdd31
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 5, 2021
dc9b943
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 5, 2021
d586aa7
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 7, 2021
996fda8
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 8, 2021
7c9ac23
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 15, 2021
8ae778a
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 21, 2021
d23b8b8
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 26, 2021
9a0db21
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into…
shwina Jan 27, 2021
b1283e3
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Jan 29, 2021
ed4b022
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Feb 1, 2021
228221e
Remove ColumnAccessor from ctor
shwina Feb 1, 2021
833c1ab
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Feb 1, 2021
1f17c3d
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into b…
shwina Feb 2, 2021
5557cb8
Merge branch 'branch-0.18' into fix-ca-constructor
shwina Feb 2, 2021
e03e505
Fix style issues related to NumPy
shwina Feb 2, 2021
d6fa28c
Changelog
shwina Feb 2, 2021
fde8070
Update copyright year
shwina Feb 2, 2021
753fad0
Raise error on fill_value being None
shwina Feb 2, 2021
306c4db
Add back type info for size
shwina Feb 2, 2021
843b5b5
Add assertions for size
shwina Feb 2, 2021
ea79e4d
Replace assertions
shwina Feb 2, 2021
1015e78
Add TODOs to make size non-optional
shwina Feb 2, 2021
49dad6c
Use a single Frame.copy(), identify other places we were calling the
shwina Feb 3, 2021
fead58d
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into f…
shwina Feb 3, 2021
6533af6
Fix array construction
shwina Feb 3, 2021
56d4edc
Enforce numpy < 1.20
shwina Feb 3, 2021
872761f
update gpu/build.sh
shwina Feb 3, 2021
99ebec8
Merge branch 'fix-numpy-style-issues' into fix-ca-constructor
shwina Feb 3, 2021
bcbefae
Merge branch 'branch-0.18' of https://github.com/rapidsai/cudf into f…
shwina Feb 3, 2021
9c1b72a
Add _from_data
shwina Feb 3, 2021
037dc41
More _from_data
shwina Feb 3, 2021
e482725
Update python/cudf/cudf/core/dataframe.py
shwina Feb 4, 2021
ed1e58c
Update python/cudf/cudf/core/frame.py
galipremsagar Feb 4, 2021
8d23f48
Merge branch 'branch-0.18' into fix-ca-constructor
galipremsagar Feb 4, 2021
9df461a
Update frame.py
galipremsagar Feb 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 18 additions & 34 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,20 +207,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
if isinstance(columns, (Series, cudf.Index)):
columns = columns.to_pandas()

if isinstance(data, ColumnAccessor):
if index is None:
index = as_index(range(data.nrows))
else:
index = as_index(index)
self._index = index

if columns is not None:
self._data = data
self._reindex(columns=columns, deep=True, inplace=True)
else:
self._data = data

elif isinstance(data, (DataFrame, pd.DataFrame)):
if isinstance(data, (DataFrame, pd.DataFrame)):
if isinstance(data, pd.DataFrame):
data = self.from_pandas(data)

Expand Down Expand Up @@ -250,7 +237,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
else:
self._index = as_index(index)
if columns is not None:

self._data = ColumnAccessor(
OrderedDict.fromkeys(
columns,
Expand Down Expand Up @@ -502,6 +488,17 @@ def _from_table(cls, table, index=None):
out._index = index
return out

@classmethod
def _from_data(cls, data, index=None, columns=None):
    """
    Build an instance of ``cls`` directly around ``data``.

    The object is allocated with ``__new__``, so ``__init__`` is not run
    and ``data`` is stored as-is.

    Parameters
    ----------
    data
        Stored on ``_data`` without conversion. Its ``nrows`` attribute
        is read when no index is supplied. Presumably a ColumnAccessor;
        confirm against callers.
    index : optional
        Stored on ``_index`` as-is. When None, an index over
        ``range(data.nrows)`` is created instead.
    columns : optional
        When not None, assigned to ``columns`` on the new object after
        the data and index have been set.

    Returns
    -------
    The newly created instance of ``cls``.
    """
    frame = cls.__new__(cls)
    frame._data = data
    frame._index = (
        cudf.Index(range(data.nrows)) if index is None else index
    )
    if columns is not None:
        frame.columns = columns
    return frame

@staticmethod
def _align_input_series_indices(data, index):
data = data.copy()
Expand Down Expand Up @@ -1332,12 +1329,14 @@ def _get_columns_by_label(self, labels, downcast=False):
elif isinstance(labels, tuple):
nlevels = len(labels)
if self._data.multiindex is False or nlevels == self._data.nlevels:
return self._constructor_sliced(
new_data, name=labels, index=self.index
out = self._constructor_sliced()._from_data(
new_data, index=self.index, name=labels
)
return self._constructor(
new_data, columns=new_data.to_pandas_index(), index=self.index
return out
out = self._constructor()._from_data(
new_data, index=self.index, columns=new_data.to_pandas_index()
)
return out

# unary, binary, rbinary, orderedcompare, unorderedcompare
def _apply_op(self, fn, other=None, fill_value=None):
Expand Down Expand Up @@ -3036,21 +3035,6 @@ def take(self, positions, keep_index=True):
out.columns = self.columns
return out

@annotate("DATAFRAME_COPY", color="cyan", domain="cudf_python")
def copy(self, deep=True):
"""
Returns a copy of this dataframe

Parameters
----------
deep: bool
Make a full copy of Series columns and Index at the GPU level, or
create a new allocation with references.
"""
out = DataFrame(data=self._data.copy(deep=deep))
out.index = self.index.copy(deep=deep)
return out

def __copy__(self):
return self.copy(deep=True)

Expand Down
90 changes: 87 additions & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import operator
import warnings
from collections import OrderedDict, abc as abc
from typing import TYPE_CHECKING, Any, Dict, Tuple, overload
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload

import cupy
import numpy as np
Expand All @@ -27,6 +27,9 @@
min_scalar_type,
)


T = TypeVar("T", bound="Frame")

if TYPE_CHECKING:
from cudf.core.column_accessor import ColumnAccessor

Expand Down Expand Up @@ -214,8 +217,89 @@ def empty(self):
def __len__(self):
return self._num_rows

def copy(self, deep=True):
return Frame(self._data.copy(deep=deep))
def copy(self: T, deep: bool = True) -> T:
    """
    Make a copy of this object's indices and data.

    When ``deep=True`` (default), a new object will be created with a
    copy of the calling object's data and indices. Modifications to
    the data or indices of the copy will not be reflected in the
    original object.
    When ``deep=False``, a new object will be created without copying
    the calling object's data or index (only references to the data
    and index are copied). Any changes to the data of the original
    will be reflected in the shallow copy (and vice versa).

    Parameters
    ----------
    deep : bool, default True
        Make a deep copy, including a copy of the data and the indices.
        With ``deep=False`` neither the indices nor the data are copied.

    Returns
    -------
    copy : Series or DataFrame
        Object type matches caller.

    Examples
    --------
    >>> s = cudf.Series([1, 2], index=["a", "b"])
    >>> s
    a    1
    b    2
    dtype: int64
    >>> s_copy = s.copy()
    >>> s_copy
    a    1
    b    2
    dtype: int64

    **Shallow copy versus default (deep) copy:**

    >>> s = cudf.Series([1, 2], index=["a", "b"])
    >>> deep = s.copy()
    >>> shallow = s.copy(deep=False)

    Shallow copy shares data and index with original.

    >>> s is shallow
    False
    >>> s._column is shallow._column and s.index is shallow.index
    True

    Deep copy has own copy of data and index.

    >>> s is deep
    False
    >>> s.values is deep.values or s.index is deep.index
    False

    Updates to the data shared by shallow copy and original are
    reflected in both; deep copy remains unchanged.

    >>> s['a'] = 3
    >>> shallow['b'] = 4
    >>> s
    a    3
    b    4
    dtype: int64
    >>> shallow
    a    3
    b    4
    dtype: int64
    >>> deep
    a    1
    b    2
    dtype: int64
    """
    # Allocate through ``__new__`` so ``__init__`` is not run; the result
    # has exactly the caller's type.
    cls = type(self)
    new_frame = cls.__new__(cls)
    new_frame._data = self._data.copy(deep=deep)
    # An object without an index keeps ``None`` rather than gaining one.
    new_frame._index = (
        None if self._index is None else self._index.copy(deep=deep)
    )
    return new_frame

@classmethod
@annotate("CONCAT", color="orange", domain="cudf_python")
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ def shape(self):

@property
def _source_data(self):
    """
    DataFrame created from this object's ``_data`` through
    ``cudf.DataFrame._from_data``; a new DataFrame is built on every
    access.
    """
    return cudf.DataFrame._from_data(data=self._data)

@_source_data.setter
def _source_data(self, value):
Expand Down Expand Up @@ -452,7 +453,7 @@ def __repr__(self):
)
preprocess = self.take(indices)
else:
preprocess = self
preprocess = self.copy(deep=False)

cols_nulls = [
preprocess._source_data._data[col].has_nulls
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,10 +831,10 @@ def as_tuple(x):
)
)._data
)

return cudf.DataFrame(
out = cudf.DataFrame._from_data(
result, index=cudf.Index(index_labels, name=index.name)
)
return out


def pivot(data, index=None, columns=None, values=None):
Expand Down
102 changes: 19 additions & 83 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
from __future__ import annotations

import pickle
import warnings
from collections import abc as abc
from numbers import Number
from shutil import get_terminal_size
from typing import Any, Set
from typing import Any, Optional, Set
from uuid import uuid4

import cupy
Expand Down Expand Up @@ -216,6 +217,23 @@ def _from_table(cls, table, index=None):
index = Index._from_table(table._index)
return cls(data=data, index=index, name=name)

@classmethod
def _from_data(
    cls,
    data: ColumnAccessor,
    index: Optional[Index] = None,
    name: Any = None,
) -> Series:
    """
    Construct the Series from a ColumnAccessor

    The instance is allocated with ``__new__``, so ``__init__`` is not
    run and ``data`` is stored as-is.

    Parameters
    ----------
    data : ColumnAccessor
        Stored on ``_data`` without conversion.
    index : Index, optional
        Stored on ``_index``. When None, ``RangeIndex(data.nrows)`` is
        used.
    name : Any, optional
        When not None, assigned to ``name`` after data and index are
        set.

    Returns
    -------
    Series
    """
    series = cls.__new__(cls)
    series._data = data
    if index is None:
        index = RangeIndex(data.nrows)
    series._index = index
    if name is not None:
        series.name = name
    return series

@property
def _column(self):
return self._data[self.name]
Expand Down Expand Up @@ -472,88 +490,6 @@ def to_arrow(self):
"""
return self._column.to_arrow()

def copy(self, deep=True):
"""
Make a copy of this object's indices and data.

When ``deep=True`` (default), a new object will be created with a
copy of the calling object's data and indices. Modifications to
the data or indices of the copy will not be reflected in the
original object (see notes below).
When ``deep=False``, a new object will be created without copying
the calling object's data or index (only references to the data
and index are copied). Any changes to the data of the original
will be reflected in the shallow copy (and vice versa).

Parameters
----------
deep : bool, default True
Make a deep copy, including a copy of the data and the indices.
With ``deep=False`` neither the indices nor the data are copied.

Returns
-------
copy : Series or DataFrame
Object type matches caller.


Examples
--------
>>> s = cudf.Series([1, 2], index=["a", "b"])
>>> s
a 1
b 2
dtype: int64
>>> s_copy = s.copy()
>>> s_copy
a 1
b 2
dtype: int64

**Shallow copy versus default (deep) copy:**

>>> s = cudf.Series([1, 2], index=["a", "b"])
>>> deep = s.copy()
>>> shallow = s.copy(deep=False)

Shallow copy shares data and index with original.

>>> s is shallow
False
>>> s._column is shallow._column and s.index is shallow.index
True

Deep copy has own copy of data and index.

>>> s is deep
False
>>> s.values is deep.values or s.index is deep.index
False

Updates to the data shared by shallow copy and original is reflected
in both; deep copy remains unchanged.

>>> s['a'] = 3
>>> shallow['b'] = 4
>>> s
a 3
b 4
dtype: int64
>>> shallow
a 3
b 4
dtype: int64
>>> deep
a 1
b 2
dtype: int64
"""
result = self._copy_construct()
if deep:
result._column = self._column.copy(deep=deep)
result.index = self.index.copy(deep=deep)
return result

def __copy__(self, deep=True):
return self.copy(deep)

Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/tests/test_cuda_array_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,11 @@ def test_cuda_array_interface_pytorch():

with pytest.raises(TypeError):
cat_series.__cuda_array_interface__


def test_cai_after_indexing():
    """
    A column's ``__cuda_array_interface__`` must compare equal before and
    after the parent DataFrame is indexed with a list of labels.
    """
    frame = cudf.DataFrame({"a": [1, 2, 3]})
    before = frame["a"].__cuda_array_interface__
    # The selection result is deliberately discarded: only its effect
    # (if any) on ``frame`` matters for the comparison below.
    frame[["a"]]
    after = frame["a"].__cuda_array_interface__
    assert before == after
9 changes: 0 additions & 9 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8057,15 +8057,6 @@ def assert_local_eq(actual, df, expected, host_columns):

assert_local_eq(actual, df, expected, host_columns)

expected = pd.DataFrame(df, columns=host_columns)
actual = gd.DataFrame(gdf._data, columns=columns, index=index)
if index is not None:
if df.shape == (0, 0):
expected = pd.DataFrame(columns=host_columns, index=index)
else:
expected.index = index
assert_local_eq(actual, df, expected, host_columns)


@pytest.mark.parametrize(
"data",
Expand Down