Skip to content

Commit

Permalink
Fix warnings in remaining modules (#12406)
Browse files Browse the repository at this point in the history
Contributes to #9999 and #10363.

When I merge these changes with #12369 I no longer see any warnings on my machine. I suspect that there will be slightly different results on different machines, so we'll have to see how CI looks after both PRs are merged before we close #10363.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: #12406
  • Loading branch information
vyasr committed Dec 21, 2022
1 parent b533259 commit 1d81fbb
Show file tree
Hide file tree
Showing 19 changed files with 179 additions and 96 deletions.
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,10 +987,11 @@ def replace(
raise NotImplementedError("`flags` parameter is not yet supported")

if can_convert_to_column(pat) and can_convert_to_column(repl):
warnings.warn(
"`n` parameter is not supported when "
"`pat` and `repl` are list-like inputs"
)
if n != -1:
warnings.warn(
"`n` parameter is not supported when "
"`pat` and `repl` are list-like inputs"
)

return self._return_or_inplace(
libstrings.replace_multi_re(
Expand Down
20 changes: 10 additions & 10 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2231,18 +2231,18 @@ def update(self, other):
If ``other`` contains NaNs the corresponding values are not updated
in the original Series.
>>> s = cudf.Series([1, 2, 3])
>>> s = cudf.Series([1.0, 2.0, 3.0])
>>> s
0 1
1 2
2 3
dtype: int64
>>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False))
0 1.0
1 2.0
2 3.0
dtype: float64
>>> s.update(cudf.Series([4.0, np.nan, 6.0], nan_as_null=False))
>>> s
0 4
1 2
2 6
dtype: int64
0 4.0
1 2.0
2 6.0
dtype: float64
``other`` can also be a non-Series object type
that is coercible into a Series
Expand Down
11 changes: 9 additions & 2 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import warnings
from collections import defaultdict
from contextlib import ExitStack
from typing import Dict, List, Tuple
from typing import Dict, List, Optional, Tuple
from uuid import uuid4

from pyarrow import dataset as ds, parquet as pq
Expand Down Expand Up @@ -1010,9 +1010,13 @@ def __init__(
) -> None:
if isinstance(path, str) and path.startswith("s3://"):
self.fs_meta = {"is_s3": True, "actual_path": path}
self.path = tempfile.TemporaryDirectory().name
self.dir_: Optional[
tempfile.TemporaryDirectory
] = tempfile.TemporaryDirectory()
self.path = self.dir_.name
else:
self.fs_meta = {}
self.dir_ = None
self.path = path

self.common_args = {
Expand Down Expand Up @@ -1194,6 +1198,9 @@ def close(self, return_metadata=False):
s3_file.put(local_path, s3_path, recursive=True)
shutil.rmtree(self.path)

if self.dir_ is not None:
self.dir_.cleanup()

if return_metadata:
return (
merge_parquet_filemetadata(metadata)
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
SIGNED_TYPES,
_decimal_series,
assert_eq,
expect_warning_if,
)

data_ = [
Expand Down Expand Up @@ -200,7 +201,8 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
elif isinstance(to_dtype, Decimal64Dtype):
expected = cudf.Series(Decimal64Column.from_arrow(pa_arr))

got = s.astype(to_dtype)
with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning):
got = s.astype(to_dtype)

assert_eq(got, expected)

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_df_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
)

if dtype[0] != _DtypeKind.BOOL:
array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get()
array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get()
col_array = cp.asarray(cudfcol.data_array_view).get()
assert_eq(
array_from_dlpack[non_null_idxs.to_numpy()].flatten(),
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_doctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

import cudf

pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning")


def _name_in_all(parent, name):
return name in getattr(parent, "__all__", [])
Expand Down
60 changes: 32 additions & 28 deletions python/cudf/cudf/tests/test_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import random

import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame, MultiIndex, Series, date_range

import cudf
from cudf import concat
Expand Down Expand Up @@ -40,7 +40,7 @@ def assert_df2(g, p):

@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
df = pd.DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
gdf = cudf.DataFrame.from_pandas(df)

assert_exceptions_equal(
Expand All @@ -59,7 +59,7 @@ def test_duplicated_with_misspelled_column_name(subset):
[1, 2, 4, 5, 6, 6],
[],
["a", "b", "s", "sd", "a", "b"],
Series(["aaa"] * 10, dtype="object"),
pd.Series(["aaa"] * 10, dtype="object"),
],
)
def test_drop_duplicates_series(data, keep):
Expand All @@ -73,7 +73,7 @@ def test_drop_duplicates_series(data, keep):


def test_drop_duplicates():
pdf = DataFrame(
pdf = pd.DataFrame(
{
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
Expand Down Expand Up @@ -146,36 +146,40 @@ def test_drop_duplicates():
expected = pdf.drop_duplicates("E", keep="last")
assert_df(result, expected)

pdf = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
pdf = pd.DataFrame(
{"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}
)
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

pdf = DataFrame([[1, 0], [0, 2]])
pdf = pd.DataFrame([[1, 0], [0, 2]])
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

pdf = DataFrame([[-2, 0], [0, -4]])
pdf = pd.DataFrame([[-2, 0], [0, -4]])
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

x = np.iinfo(np.int64).max / 3 * 2
pdf = DataFrame([[-x, x], [0, x + 4]])
pdf = pd.DataFrame([[-x, x], [0, x + 4]])
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

pdf = DataFrame([[-x, x], [x, x + 4]])
pdf = pd.DataFrame([[-x, x], [x, x + 4]])
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

pdf = DataFrame([i] * 9 for i in range(16))
pdf = pdf.append([[1] + [0] * 8], ignore_index=True)
pdf = pd.DataFrame([i] * 9 for i in range(16))
pdf = pd.concat([pdf, pd.DataFrame([[1] + [0] * 8])], ignore_index=True)
gdf = cudf.DataFrame.from_pandas(pdf)
assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())


@pytest.mark.skip(reason="cudf does not support duplicate column names yet")
def test_drop_duplicates_with_duplicate_column_names():
df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
df = pd.DataFrame(
[[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]
)
df = cudf.DataFrame.from_pandas(df)

result0 = df.drop_duplicates()
Expand All @@ -187,7 +191,7 @@ def test_drop_duplicates_with_duplicate_column_names():


def test_drop_duplicates_for_take_all():
pdf = DataFrame(
pdf = pd.DataFrame(
{
"AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
Expand Down Expand Up @@ -224,7 +228,7 @@ def test_drop_duplicates_for_take_all():


def test_drop_duplicates_tuple():
pdf = DataFrame(
pdf = pd.DataFrame(
{
("AA", "AB"): [
"foo",
Expand Down Expand Up @@ -265,11 +269,11 @@ def test_drop_duplicates_tuple():
@pytest.mark.parametrize(
"df",
[
DataFrame(),
DataFrame(columns=[]),
DataFrame(columns=["A", "B", "C"]),
DataFrame(index=[]),
DataFrame(index=["A", "B", "C"]),
pd.DataFrame(),
pd.DataFrame(columns=[]),
pd.DataFrame(columns=["A", "B", "C"]),
pd.DataFrame(index=[]),
pd.DataFrame(index=["A", "B", "C"]),
],
)
def test_drop_duplicates_empty(df):
Expand All @@ -292,7 +296,7 @@ def get_pdf(n_dup):
# create dataframe with n_dup duplicate rows
rows = comb + shuf[:n_dup]
random.Random(n_dup).shuffle(rows)
return DataFrame(rows)
return pd.DataFrame(rows)

for i in range(5):
pdf = get_pdf(i)
Expand Down Expand Up @@ -328,7 +332,7 @@ def get_pdf(n_dup):


def test_dataframe_drop_duplicates_method():
pdf = DataFrame(
pdf = pd.DataFrame(
[(1, 2, "a"), (2, 3, "b"), (3, 4, "c"), (2, 3, "d"), (3, 5, "c")],
columns=["n1", "n2", "s1"],
)
Expand Down Expand Up @@ -387,7 +391,7 @@ def test_dataframe_drop_duplicates_method():
def test_datetime_drop_duplicates():

date_df = cudf.DataFrame()
date_df["date"] = date_range("11/20/2018", periods=6, freq="D")
date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D")
date_df["value"] = np.random.sample(len(date_df))

df = concat([date_df, date_df[:4]])
Expand All @@ -402,7 +406,7 @@ def test_datetime_drop_duplicates():

def test_drop_duplicates_NA():
# none
df = DataFrame(
df = pd.DataFrame(
{
"A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
Expand Down Expand Up @@ -439,7 +443,7 @@ def test_drop_duplicates_NA():
assert_df(result, expected)

# nan
df = DataFrame(
df = pd.DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
Expand Down Expand Up @@ -481,7 +485,7 @@ def test_drop_duplicates_NA_for_take_all():
# pandas drop_duplicates calls in this function.

# none
pdf = DataFrame(
pdf = pd.DataFrame(
{
"A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
"C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
Expand Down Expand Up @@ -531,7 +535,7 @@ def test_drop_duplicates_NA_for_take_all():


def test_drop_duplicates_inplace():
orig = DataFrame(
orig = pd.DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
Expand Down Expand Up @@ -608,8 +612,8 @@ def test_drop_duplicates_multi_index():
["one", "two", "one", "two", "one", "two", "one", "two"],
]

idx = MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"])
pdf = DataFrame(np.random.randint(0, 2, (8, 4)), index=idx)
idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"])
pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx)
gdf = cudf.DataFrame.from_pandas(pdf)

expected = pdf.drop_duplicates()
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/tests/test_hash_vocab.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
import filecmp
import os
import warnings

import pytest

Expand All @@ -20,6 +21,9 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):

groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
output_path = tmpdir.join("cudf-vocab-hash.txt")
hash_vocab(vocab_path, output_path)
with warnings.catch_warnings():
# See https://github.com/rapidsai/cudf/issues/12403
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)

assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,9 +768,9 @@ def test_merge_sort_on_indexes(kwargs):
definitely_sorted.index.name = None
assert_eq(gd_merge, definitely_sorted)
elif left_index:
assert gd_merge["b"].is_monotonic
assert gd_merge["b"].is_monotonic_increasing
elif right_index:
assert gd_merge["a"].is_monotonic
assert gd_merge["a"].is_monotonic_increasing


@pytest.mark.parametrize(
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,8 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_strings():
)
def test_json_to_json_compare_contents(gdf, pdf):
expected_json = pdf.to_json(lines=True, orient="records")
actual_json = gdf.to_json(lines=True, orient="records")
with pytest.warns(UserWarning):
actual_json = gdf.to_json(lines=True, orient="records")

assert expected_json == actual_json

Expand Down Expand Up @@ -988,7 +989,8 @@ def test_json_round_trip_gzip():
df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]})
bytes = BytesIO()
with gzip.open(bytes, mode="wb") as fo:
df.to_json(fo, orient="records", lines=True)
with pytest.warns(UserWarning):
df.to_json(fo, orient="records", lines=True)
bytes.seek(0)
with gzip.open(bytes, mode="rb") as fo:
written_df = cudf.read_json(fo, orient="records", lines=True)
Expand All @@ -999,7 +1001,8 @@ def test_json_round_trip_gzip():

with gzip.open(bytes, mode="wb") as fo:
fo.seek(loc)
df.to_json(fo, orient="records", lines=True)
with pytest.warns(UserWarning):
df.to_json(fo, orient="records", lines=True)
bytes.seek(loc)
with gzip.open(bytes, mode="rb") as fo:
fo.seek(loc)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_query(data, fn, nulls):
pdf["a"] = np.arange(nelem)
pdf["b"] = np.random.random(nelem) * nelem
if nulls:
pdf["a"][::2] = None
pdf.loc[::2, "a"] = None
gdf = cudf.from_pandas(pdf)
assert_eq(pdf.query(query_expr), gdf.query(query_expr))

Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def test_rank_all_arguments(
assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs))
assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs))
if numeric_only:
expect = pdf["str"].rank(**kwargs)
with pytest.warns(FutureWarning):
expect = pdf["str"].rank(**kwargs)
got = gdf["str"].rank(**kwargs)
assert expect.empty == got.empty
expected = pdf.select_dtypes(include=np.number)
Expand Down

0 comments on commit 1d81fbb

Please sign in to comment.