Fix warnings in remaining modules (#12406)

Contributes to #9999 and #10363. When I merge these changes with #12369 I no longer see any warnings on my machine. I suspect that there will be slightly different results on different machines, so we'll have to see how CI looks after both PRs are merged before we close #10363. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: #12406
rapidsai · Dec 21, 2022 · 1d81fbb · 1d81fbb
1 parent b533259
commit 1d81fbb
Show file tree

Hide file tree

Showing 19 changed files with 179 additions and 96 deletions.
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -987,10 +987,11 @@ def replace(
             raise NotImplementedError("`flags` parameter is not yet supported")
 
         if can_convert_to_column(pat) and can_convert_to_column(repl):
-            warnings.warn(
-                "`n` parameter is not supported when "
-                "`pat` and `repl` are list-like inputs"
-            )
+            if n != -1:
+                warnings.warn(
+                    "`n` parameter is not supported when "
+                    "`pat` and `repl` are list-like inputs"
+                )
 
             return self._return_or_inplace(
                 libstrings.replace_multi_re(

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -2231,18 +2231,18 @@ def update(self, other):
         If ``other`` contains NaNs the corresponding values are not updated
         in the original Series.
 
-        >>> s = cudf.Series([1, 2, 3])
+        >>> s = cudf.Series([1.0, 2.0, 3.0])
         >>> s
-        0    1
-        1    2
-        2    3
-        dtype: int64
-        >>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False))
+        0    1.0
+        1    2.0
+        2    3.0
+        dtype: float64
+        >>> s.update(cudf.Series([4.0, np.nan, 6.0], nan_as_null=False))
         >>> s
-        0    4
-        1    2
-        2    6
-        dtype: int64
+        0    4.0
+        1    2.0
+        2    6.0
+        dtype: float64
 
         ``other`` can also be a non-Series object type
         that is coercible into a Series

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
@@ -6,7 +6,7 @@
 import warnings
 from collections import defaultdict
 from contextlib import ExitStack
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 from uuid import uuid4
 
 from pyarrow import dataset as ds, parquet as pq
@@ -1010,9 +1010,13 @@ def __init__(
     ) -> None:
         if isinstance(path, str) and path.startswith("s3://"):
             self.fs_meta = {"is_s3": True, "actual_path": path}
-            self.path = tempfile.TemporaryDirectory().name
+            self.dir_: Optional[
+                tempfile.TemporaryDirectory
+            ] = tempfile.TemporaryDirectory()
+            self.path = self.dir_.name
         else:
             self.fs_meta = {}
+            self.dir_ = None
             self.path = path
 
         self.common_args = {
@@ -1194,6 +1198,9 @@ def close(self, return_metadata=False):
             s3_file.put(local_path, s3_path, recursive=True)
             shutil.rmtree(self.path)
 
+        if self.dir_ is not None:
+            self.dir_.cleanup()
+
         if return_metadata:
             return (
                 merge_parquet_filemetadata(metadata)

diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
@@ -16,6 +16,7 @@
     SIGNED_TYPES,
     _decimal_series,
     assert_eq,
+    expect_warning_if,
 )
 
 data_ = [
@@ -200,7 +201,8 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
     elif isinstance(to_dtype, Decimal64Dtype):
         expected = cudf.Series(Decimal64Column.from_arrow(pa_arr))
 
-    got = s.astype(to_dtype)
+    with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning):
+        got = s.astype(to_dtype)
 
     assert_eq(got, expected)
 

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
@@ -40,7 +40,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     )
 
     if dtype[0] != _DtypeKind.BOOL:
-        array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get()
+        array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get()
         col_array = cp.asarray(cudfcol.data_array_view).get()
         assert_eq(
             array_from_dlpack[non_null_idxs.to_numpy()].flatten(),

diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py
@@ -10,6 +10,8 @@
 
 import cudf
 
+pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning")
+
 
 def _name_in_all(parent, name):
     return name in getattr(parent, "__all__", [])

diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
@@ -4,8 +4,8 @@
 import random
 
 import numpy as np
+import pandas as pd
 import pytest
-from pandas import DataFrame, MultiIndex, Series, date_range
 
 import cudf
 from cudf import concat
@@ -40,7 +40,7 @@ def assert_df2(g, p):
 
 @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
 def test_duplicated_with_misspelled_column_name(subset):
-    df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
+    df = pd.DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
     gdf = cudf.DataFrame.from_pandas(df)
 
     assert_exceptions_equal(
@@ -59,7 +59,7 @@ def test_duplicated_with_misspelled_column_name(subset):
         [1, 2, 4, 5, 6, 6],
         [],
         ["a", "b", "s", "sd", "a", "b"],
-        Series(["aaa"] * 10, dtype="object"),
+        pd.Series(["aaa"] * 10, dtype="object"),
     ],
 )
 def test_drop_duplicates_series(data, keep):
@@ -73,7 +73,7 @@ def test_drop_duplicates_series(data, keep):
 
 
 def test_drop_duplicates():
-    pdf = DataFrame(
+    pdf = pd.DataFrame(
         {
             "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
             "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
@@ -146,36 +146,40 @@ def test_drop_duplicates():
     expected = pdf.drop_duplicates("E", keep="last")
     assert_df(result, expected)
 
-    pdf = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
+    pdf = pd.DataFrame(
+        {"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}
+    )
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
-    pdf = DataFrame([[1, 0], [0, 2]])
+    pdf = pd.DataFrame([[1, 0], [0, 2]])
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
-    pdf = DataFrame([[-2, 0], [0, -4]])
+    pdf = pd.DataFrame([[-2, 0], [0, -4]])
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
     x = np.iinfo(np.int64).max / 3 * 2
-    pdf = DataFrame([[-x, x], [0, x + 4]])
+    pdf = pd.DataFrame([[-x, x], [0, x + 4]])
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
-    pdf = DataFrame([[-x, x], [x, x + 4]])
+    pdf = pd.DataFrame([[-x, x], [x, x + 4]])
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
-    pdf = DataFrame([i] * 9 for i in range(16))
-    pdf = pdf.append([[1] + [0] * 8], ignore_index=True)
+    pdf = pd.DataFrame([i] * 9 for i in range(16))
+    pdf = pd.concat([pdf, pd.DataFrame([[1] + [0] * 8])], ignore_index=True)
     gdf = cudf.DataFrame.from_pandas(pdf)
     assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
 
 
 @pytest.mark.skip(reason="cudf does not support duplicate column names yet")
 def test_drop_duplicates_with_duplicate_column_names():
-    df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
+    df = pd.DataFrame(
+        [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]
+    )
     df = cudf.DataFrame.from_pandas(df)
 
     result0 = df.drop_duplicates()
@@ -187,7 +191,7 @@ def test_drop_duplicates_with_duplicate_column_names():
 
 
 def test_drop_duplicates_for_take_all():
-    pdf = DataFrame(
+    pdf = pd.DataFrame(
         {
             "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
             "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
@@ -224,7 +228,7 @@ def test_drop_duplicates_for_take_all():
 
 
 def test_drop_duplicates_tuple():
-    pdf = DataFrame(
+    pdf = pd.DataFrame(
         {
             ("AA", "AB"): [
                 "foo",
@@ -265,11 +269,11 @@ def test_drop_duplicates_tuple():
 @pytest.mark.parametrize(
     "df",
     [
-        DataFrame(),
-        DataFrame(columns=[]),
-        DataFrame(columns=["A", "B", "C"]),
-        DataFrame(index=[]),
-        DataFrame(index=["A", "B", "C"]),
+        pd.DataFrame(),
+        pd.DataFrame(columns=[]),
+        pd.DataFrame(columns=["A", "B", "C"]),
+        pd.DataFrame(index=[]),
+        pd.DataFrame(index=["A", "B", "C"]),
     ],
 )
 def test_drop_duplicates_empty(df):
@@ -292,7 +296,7 @@ def get_pdf(n_dup):
         # create dataframe with n_dup duplicate rows
         rows = comb + shuf[:n_dup]
         random.Random(n_dup).shuffle(rows)
-        return DataFrame(rows)
+        return pd.DataFrame(rows)
 
     for i in range(5):
         pdf = get_pdf(i)
@@ -328,7 +332,7 @@ def get_pdf(n_dup):
 
 
 def test_dataframe_drop_duplicates_method():
-    pdf = DataFrame(
+    pdf = pd.DataFrame(
         [(1, 2, "a"), (2, 3, "b"), (3, 4, "c"), (2, 3, "d"), (3, 5, "c")],
         columns=["n1", "n2", "s1"],
     )
@@ -387,7 +391,7 @@ def test_dataframe_drop_duplicates_method():
 def test_datetime_drop_duplicates():
 
     date_df = cudf.DataFrame()
-    date_df["date"] = date_range("11/20/2018", periods=6, freq="D")
+    date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D")
     date_df["value"] = np.random.sample(len(date_df))
 
     df = concat([date_df, date_df[:4]])
@@ -402,7 +406,7 @@ def test_datetime_drop_duplicates():
 
 def test_drop_duplicates_NA():
     # none
-    df = DataFrame(
+    df = pd.DataFrame(
         {
             "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
             "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
@@ -439,7 +443,7 @@ def test_drop_duplicates_NA():
     assert_df(result, expected)
 
     # nan
-    df = DataFrame(
+    df = pd.DataFrame(
         {
             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
             "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
@@ -481,7 +485,7 @@ def test_drop_duplicates_NA_for_take_all():
     # pandas drop_duplicates calls in this function.
 
     # none
-    pdf = DataFrame(
+    pdf = pd.DataFrame(
         {
             "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
             "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
@@ -531,7 +535,7 @@ def test_drop_duplicates_NA_for_take_all():
 
 
 def test_drop_duplicates_inplace():
-    orig = DataFrame(
+    orig = pd.DataFrame(
         {
             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
             "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
@@ -608,8 +612,8 @@ def test_drop_duplicates_multi_index():
         ["one", "two", "one", "two", "one", "two", "one", "two"],
     ]
 
-    idx = MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"])
-    pdf = DataFrame(np.random.randint(0, 2, (8, 4)), index=idx)
+    idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"])
+    pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx)
     gdf = cudf.DataFrame.from_pandas(pdf)
 
     expected = pdf.drop_duplicates()

diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 import filecmp
 import os
+import warnings
 
 import pytest
 
@@ -20,6 +21,9 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    hash_vocab(vocab_path, output_path)
+    with warnings.catch_warnings():
+        # See https://github.com/rapidsai/cudf/issues/12403
+        warnings.simplefilter(action="ignore", category=RuntimeWarning)
+        hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
@@ -768,9 +768,9 @@ def test_merge_sort_on_indexes(kwargs):
         definitely_sorted.index.name = None
         assert_eq(gd_merge, definitely_sorted)
     elif left_index:
-        assert gd_merge["b"].is_monotonic
+        assert gd_merge["b"].is_monotonic_increasing
     elif right_index:
-        assert gd_merge["a"].is_monotonic
+        assert gd_merge["a"].is_monotonic_increasing
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
@@ -512,7 +512,8 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_strings():
 )
 def test_json_to_json_compare_contents(gdf, pdf):
     expected_json = pdf.to_json(lines=True, orient="records")
-    actual_json = gdf.to_json(lines=True, orient="records")
+    with pytest.warns(UserWarning):
+        actual_json = gdf.to_json(lines=True, orient="records")
 
     assert expected_json == actual_json
 
@@ -988,7 +989,8 @@ def test_json_round_trip_gzip():
     df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]})
     bytes = BytesIO()
     with gzip.open(bytes, mode="wb") as fo:
-        df.to_json(fo, orient="records", lines=True)
+        with pytest.warns(UserWarning):
+            df.to_json(fo, orient="records", lines=True)
     bytes.seek(0)
     with gzip.open(bytes, mode="rb") as fo:
         written_df = cudf.read_json(fo, orient="records", lines=True)
@@ -999,7 +1001,8 @@ def test_json_round_trip_gzip():
 
     with gzip.open(bytes, mode="wb") as fo:
         fo.seek(loc)
-        df.to_json(fo, orient="records", lines=True)
+        with pytest.warns(UserWarning):
+            df.to_json(fo, orient="records", lines=True)
     bytes.seek(loc)
     with gzip.open(bytes, mode="rb") as fo:
         fo.seek(loc)

diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
@@ -50,7 +50,7 @@ def test_query(data, fn, nulls):
     pdf["a"] = np.arange(nelem)
     pdf["b"] = np.random.random(nelem) * nelem
     if nulls:
-        pdf["a"][::2] = None
+        pdf.loc[::2, "a"] = None
     gdf = cudf.from_pandas(pdf)
     assert_eq(pdf.query(query_expr), gdf.query(query_expr))
 

diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
@@ -55,7 +55,8 @@ def test_rank_all_arguments(
     assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs))
     assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs))
     if numeric_only:
-        expect = pdf["str"].rank(**kwargs)
+        with pytest.warns(FutureWarning):
+            expect = pdf["str"].rank(**kwargs)
         got = gdf["str"].rank(**kwargs)
         assert expect.empty == got.empty
         expected = pdf.select_dtypes(include=np.number)