Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce pytest runtime #10203

Merged
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import itertools
import re
import warnings
from collections.abc import Mapping, Sequence
Expand Down Expand Up @@ -330,3 +331,9 @@ def does_not_raise():

def xfail_param(param, **kwargs):
return pytest.param(param, marks=pytest.mark.xfail(**kwargs))


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
)
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2022, NVIDIA CORPORATION.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

?

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -210,7 +210,7 @@ def test_can_parse_no_schema():
assert_eq(expected, actual)
bdice marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize("rows", [0, 1, 10, 100000])
@pytest.mark.parametrize("rows", [0, 1, 10, 1000])
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
def test_avro_compression(rows, codec):
schema = {
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/tests/test_binops.py
Expand Up @@ -4,7 +4,7 @@
import decimal
import operator
import random
from itertools import product
from itertools import combinations_with_replacement, product

import cupy as cp
import numpy as np
Expand Down Expand Up @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype):


def _series_compare_nulls_typegen():
tests = []
tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
tests += list(product(STRING_TYPES, STRING_TYPES))

return tests
return [
*combinations_with_replacement(DATETIME_TYPES, 2),
*combinations_with_replacement(TIMEDELTA_TYPES, 2),
*combinations_with_replacement(NUMERIC_TYPES, 2),
*combinations_with_replacement(STRING_TYPES, 2),
]


@pytest.mark.parametrize("cmpop", _cmpops)
Expand Down
11 changes: 6 additions & 5 deletions python/cudf/cudf/tests/test_csv.py
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

import gzip
import os
Expand All @@ -8,6 +8,7 @@
from io import BytesIO, StringIO
bdice marked this conversation as resolved.
Show resolved Hide resolved
from pathlib import Path

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir):
def test_csv_reader_carriage_return(tmpdir):
rows = 1000
names = ["int_row", "int_double_row"]

buffer = ",".join(names) + "\r\n"
for row in range(rows):
buffer += str(row) + ", " + str(2 * row) + "\r\n"

df = read_csv(StringIO(buffer))
expect = cudf.DataFrame(
{"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
)

assert len(df) == rows
for row in range(0, rows):
assert df[names[0]][row] == row
assert df[names[1]][row] == 2 * row
assert_eq(expect, df)


def test_csv_reader_tabs():
Expand Down
38 changes: 0 additions & 38 deletions python/cudf/cudf/tests/test_dataframe.py
Expand Up @@ -6670,7 +6670,6 @@ def test_dataframe_info_null_counts():
"data1",
[
[1, 2, 3, 4, 5, 6, 7],
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
[
1.9876543,
2.9876654,
Expand All @@ -6689,31 +6688,12 @@ def test_dataframe_info_null_counts():
-6.88918237,
-7.00001,
],
[
1.987654321,
2.987654321,
3.987654321,
0.1221,
2.1221,
0.112121,
-21.1212,
],
[
-1.987654321,
-2.987654321,
-3.987654321,
-0.1221,
-2.1221,
-0.112121,
21.1212,
],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I am just deleting tests that might be redundant. I am basing this off the assumption that all these different sets of numbers are not really increasing test coverage.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For some values of rtol and atol, the deleted values seem to be testing the behavior of isclose at a granularity finer than float32 (requiring float64s for accuracy). That seems potentially important, and would deserve a code comment explaining the coverage that each parameter set adds. Instead of deleting these, we might reframe these parameters so that we don't test a whole matrix of all input values and all atol/rtol values (6x6x6x6 = 1296 tests), and only test certain pieces of the matrix that actually cover the important pieces of the behavior (comparing ints and floats, testing isclose at float64 precision, etc.).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICT these tests were only covering float64 both before and after the change. The dataframe and array constructors implicitly upcast the incoming data, so it's always getting compared as float64. Is your concern that we're not covering float32 at all here?

Copy link
Contributor

@bdice bdice Feb 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I wasn't very clear. The issue isn't really about data types. It's about the rtol / atol precision requiring float64 precision, which these changes no longer test adequately. The real-world cases of isclose I have seen use very tight tolerances (sometimes tighter than float32 precision, like 1e-08 for data on the order of 1). Currently, this PR removes the input data that is designed to test those cases of tight tolerances.

If you look at the data1/data2 values like 1.987654321 and 1.9876543, those pairs of input data are meant to be compared with the rtol/atol values of 1e-08 in the other set of parameters. If we remove the more-precise values here, we aren't getting good coverage of the tighter tolerance 1e-08, which requires float64 precision to get the correct results. By removing these pairings of parametrized values, this test would no longer fail if the isclose functions in cuDF or cupy were to erroneously cast their inputs to float32.

I agree that this test is grossly over-parametrized, but the deletions here are removing an important case to check.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just undo these changes and add a TODO that we can be more clever about parametrizing this particular test. The other changes in this PR give a more-than-meaningful improvement in test time and I don't think it's worth investing much more time over this single test at the moment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that would be good. I can file an issue describing what I've discussed with @brandon-b-miller via Slack.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me. Thank you, Bradley!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue #10284 filed. I can put this in my backlog of things to do, or I can help someone else construct the specific cases I have in mind for test coverage.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change has been reverted

],
)
@pytest.mark.parametrize(
"data2",
[
[1, 2, 3, 4, 5, 6, 7],
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
[
1.9876543,
2.9876654,
Expand All @@ -6732,24 +6712,6 @@ def test_dataframe_info_null_counts():
-6.88918237,
-7.00001,
],
[
1.987654321,
2.987654321,
3.987654321,
0.1221,
2.1221,
0.112121,
-21.1212,
],
[
-1.987654321,
-2.987654321,
-3.987654321,
-0.1221,
-2.1221,
-0.112121,
21.1212,
],
],
)
@pytest.mark.parametrize("rtol", [0, 0.01, 1e-05, 1e-08, 5e-1, 50.12])
Expand Down
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,13 +1,17 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
import operator
bdice marked this conversation as resolved.
Show resolved Hide resolved

import cupy as cp
import numpy as np
import pytest
from numba import cuda, types
from numba.cuda import compile_ptx
from numba.np.numpy_support import from_dtype

from cudf import NA
from cudf.core.udf.api import Masked
from cudf.core.udf.typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise

arith_ops = (
operator.add,
Expand Down Expand Up @@ -159,19 +163,21 @@ def func(x):


@pytest.mark.parametrize("op", ops)
@pytest.mark.parametrize("ty1", number_types, ids=number_ids)
@pytest.mark.parametrize("ty2", number_types, ids=number_ids)
@parametrize_numeric_dtypes_pairwise
@pytest.mark.parametrize(
"masked",
((False, True), (True, False), (True, True)),
ids=("um", "mu", "mm"),
)
def test_compile_arith_masked_ops(op, ty1, ty2, masked):
def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
def func(x, y):
return op(x, y)

cc = (7, 5)

ty1 = from_dtype(np.dtype(left_dtype))
ty2 = from_dtype(np.dtype(right_dtype))

if masked[0]:
ty1 = MaskedType(ty1)
if masked[1]:
Expand Down
46 changes: 22 additions & 24 deletions python/cudf/cudf/tests/test_indexing.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from itertools import combinations

Expand Down Expand Up @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe):


@pytest.mark.parametrize(
bdice marked this conversation as resolved.
Show resolved Hide resolved
"gdf",
"gdf_kwargs",
[
cudf.DataFrame({"a": range(1000000)}),
cudf.DataFrame({"a": range(1000000), "b": range(1000000)}),
cudf.DataFrame({"a": range(20), "b": range(20)}),
cudf.DataFrame(
{
{"data": {"a": range(100000)}},
{"data": {"a": range(100000), "b": range(100000)}},
{
"data": {
"a": range(20),
"b": range(20),
"c": ["abc", "def", "xyz", "def", "pqr"] * 4,
}
),
cudf.DataFrame(index=[1, 2, 3]),
cudf.DataFrame(index=range(1000000)),
cudf.DataFrame(columns=["a", "b", "c", "d"]),
cudf.DataFrame(columns=["a"], index=range(1000000)),
cudf.DataFrame(
columns=["a", "col2", "...col n"], index=range(1000000)
),
cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")),
cudf.DataFrame(
columns=["a", "b", "c", "d"],
index=cudf.Series(range(1000000)).astype("str"),
),
},
{"index": [1, 2, 3]},
{"index": range(100000)},
{"columns": ["a", "b", "c", "d"]},
{"columns": ["a"], "index": range(100000)},
{"columns": ["a", "col2", "...col n"], "index": range(100000)},
{"index": cudf.Series(range(100000)).astype("str")},
{
"columns": ["a", "b", "c", "d"],
"index": cudf.Series(range(100000)).astype("str"),
},
],
)
@pytest.mark.parametrize(
"slice",
[
slice(250000, 500000),
slice(250000, 250001),
slice(500000),
slice(25000, 50000),
slice(25000, 25001),
slice(50000),
slice(1, 10),
slice(10, 20),
slice(15, 24000),
slice(6),
Copy link
Contributor

@bdice bdice Feb 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're testing multiple combinations, we should have coverage of unique code paths: three-argument slices like slice(start, stop, step), negative indices, reversed slices, and empty slices. In the spirit of reducing runtime, some of the other cases can probably be removed if we aim for covering only unique cases. Also, I see no reason why we can't cut this test down to 100 rows instead of 100,000.

Suggested change
slice(6),
slice(6, None), # start but no stop, [6:]
slice(None, None, 3), # only step, [::3]
slice(1, 10, 2), # start, stop, step
slice(3, -5, 2), # negative stop
slice(-2, -4), # slice is empty
slice(-10, -20, -1), # reversed slice
slice(None), # slices everything, same as [:]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried some of these and we actually get multiple failures with these. Raising an issue now

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Glad I could help catch a bug here. Please tag me in that issue, I'm interested in seeing what you found. Slice all the things! 🥷⚔️🥷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raised #10292

],
)
def test_dataframe_sliced(gdf, slice):
def test_dataframe_sliced(gdf_kwargs, slice):
gdf = cudf.DataFrame(**gdf_kwargs)
pdf = gdf.to_pandas()

actual = gdf[slice]
Expand Down
24 changes: 13 additions & 11 deletions python/cudf/cudf/tests/test_orc.py
Expand Up @@ -16,6 +16,7 @@

import cudf
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
assert_eq,
gen_rand_series,
Expand Down Expand Up @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
path, engine=engine, columns=columns, use_index=use_index
)

assert_eq(expect, got, check_categorical=False)
assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)


def test_orc_reader_filenotfound(tmpdir):
Expand Down Expand Up @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
else:
print(type(excpr).__name__)

expect = orcfile.read(columns=columns).to_pandas()
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression)
got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
expect.to_orc(gdf_fname.strpath, compression=compression)
got = cudf.from_pandas(
pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
)

assert_eq(expect, got)
assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
Expand All @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
else:
print(type(excpr).__name__)

expect = orcfile.read().to_pandas()
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq)
got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
expect = cudf.from_pandas(orcfile.read().to_pandas())
expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())

assert_eq(expect, got)
assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
Expand Down Expand Up @@ -492,8 +495,7 @@ def test_chunked_orc_writer(
writer.close()

got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

assert_eq(expect, got)
assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))


@pytest.mark.parametrize(
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
Expand Up @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
assert_eq(expect, got)


@pytest.mark.parametrize("skip", range(0, 128))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_skiprows(skip, tmpdir):
num_rows = 128
num_rows = 10
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I reason that if 0:10 work then 11:128 should too.

Copy link
Contributor

@bdice bdice Feb 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe even replace range(0, 10) with [0, 1, 5, 10]. Maybe even search the tests for the regex parametrize.*range. 🙃

src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
Expand All @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("skip", range(0, 120))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_num_rows(skip, tmpdir):
num_rows = 128
num_rows = 20
src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
Expand Down