Reduce pytest runtime #10203

Merged

Changes from all commits (23 commits)
9 changes: 8 additions & 1 deletion python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import itertools
import re
import warnings
from collections.abc import Mapping, Sequence
@@ -330,3 +331,9 @@ def does_not_raise():

def xfail_param(param, **kwargs):
return pytest.param(param, marks=pytest.mark.xfail(**kwargs))


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
)
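
A rough sketch, outside the diff, of why the new pairwise helper shrinks the dtype matrix: the assumption (not stated in the diff) is that testing each unordered dtype pair once is enough, so combinations_with_replacement yields n * (n + 1) / 2 cases where itertools.product yields n * n. The dtype list below is an illustrative subset, not cuDF's actual NUMERIC_TYPES.

from itertools import combinations_with_replacement, product

dtypes = ["int8", "int32", "int64", "float32", "float64"]  # illustrative subset only

full = list(product(dtypes, repeat=2))                     # ordered pairs: n * n
pairwise = list(combinations_with_replacement(dtypes, 2))  # unordered pairs: n * (n + 1) / 2

print(len(full))      # 25
print(len(pairwise))  # 15
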
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -210,7 +210,7 @@ def test_can_parse_no_schema():
assert_eq(expected, actual)


@pytest.mark.parametrize("rows", [0, 1, 10, 100000])
@pytest.mark.parametrize("rows", [0, 1, 10, 1000])
@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
def test_avro_compression(rows, codec):
schema = {
15 changes: 7 additions & 8 deletions python/cudf/cudf/tests/test_binops.py
@@ -4,7 +4,7 @@
import decimal
import operator
import random
from itertools import product
from itertools import combinations_with_replacement, product

import cupy as cp
import numpy as np
@@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype):


def _series_compare_nulls_typegen():
tests = []
tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
tests += list(product(STRING_TYPES, STRING_TYPES))

return tests
return [
*combinations_with_replacement(DATETIME_TYPES, 2),
*combinations_with_replacement(TIMEDELTA_TYPES, 2),
*combinations_with_replacement(NUMERIC_TYPES, 2),
*combinations_with_replacement(STRING_TYPES, 2),
]


@pytest.mark.parametrize("cmpop", _cmpops)
11 changes: 6 additions & 5 deletions python/cudf/cudf/tests/test_csv.py
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

import gzip
import os
@@ -8,6 +8,7 @@
from io import BytesIO, StringIO
from pathlib import Path

import cupy as cp
import numpy as np
import pandas as pd
import pytest
@@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir):
def test_csv_reader_carriage_return(tmpdir):
rows = 1000
names = ["int_row", "int_double_row"]

buffer = ",".join(names) + "\r\n"
for row in range(rows):
buffer += str(row) + ", " + str(2 * row) + "\r\n"

df = read_csv(StringIO(buffer))
expect = cudf.DataFrame(
{"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
)

assert len(df) == rows
for row in range(0, rows):
assert df[names[0]][row] == row
assert df[names[1]][row] == 2 * row
assert_eq(expect, df)


def test_csv_reader_tabs():
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,13 +1,17 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
import operator

import cupy as cp
import numpy as np
import pytest
from numba import cuda, types
from numba.cuda import compile_ptx
from numba.np.numpy_support import from_dtype

from cudf import NA
from cudf.core.udf.api import Masked
from cudf.core.udf.typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise

arith_ops = (
operator.add,
@@ -159,19 +163,21 @@ def func(x):


@pytest.mark.parametrize("op", ops)
@pytest.mark.parametrize("ty1", number_types, ids=number_ids)
@pytest.mark.parametrize("ty2", number_types, ids=number_ids)
@parametrize_numeric_dtypes_pairwise
@pytest.mark.parametrize(
"masked",
((False, True), (True, False), (True, True)),
ids=("um", "mu", "mm"),
)
def test_compile_arith_masked_ops(op, ty1, ty2, masked):
def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
def func(x, y):
return op(x, y)

cc = (7, 5)

ty1 = from_dtype(np.dtype(left_dtype))
ty2 = from_dtype(np.dtype(right_dtype))

if masked[0]:
ty1 = MaskedType(ty1)
if masked[1]:
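
For reference, a minimal sketch (assuming NumPy and Numba are installed) of the dtype-string-to-Numba-type conversion the rewritten test now performs itself, since the shared pairwise decorator carries plain dtype names rather than prebuilt Numba types:

import numpy as np
from numba.np.numpy_support import from_dtype

left_dtype, right_dtype = "int32", "float64"  # one hypothetical pair from the pairwise matrix

ty1 = from_dtype(np.dtype(left_dtype))   # Numba scalar type int32
ty2 = from_dtype(np.dtype(right_dtype))  # Numba scalar type float64
print(ty1, ty2)
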
46 changes: 22 additions & 24 deletions python/cudf/cudf/tests/test_indexing.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from itertools import combinations

@@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe):


@pytest.mark.parametrize(
"gdf",
"gdf_kwargs",
[
cudf.DataFrame({"a": range(1000000)}),
cudf.DataFrame({"a": range(1000000), "b": range(1000000)}),
cudf.DataFrame({"a": range(20), "b": range(20)}),
cudf.DataFrame(
{
{"data": {"a": range(100000)}},
{"data": {"a": range(100000), "b": range(100000)}},
{
"data": {
"a": range(20),
"b": range(20),
"c": ["abc", "def", "xyz", "def", "pqr"] * 4,
}
),
cudf.DataFrame(index=[1, 2, 3]),
cudf.DataFrame(index=range(1000000)),
cudf.DataFrame(columns=["a", "b", "c", "d"]),
cudf.DataFrame(columns=["a"], index=range(1000000)),
cudf.DataFrame(
columns=["a", "col2", "...col n"], index=range(1000000)
),
cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")),
cudf.DataFrame(
columns=["a", "b", "c", "d"],
index=cudf.Series(range(1000000)).astype("str"),
),
},
{"index": [1, 2, 3]},
{"index": range(100000)},
{"columns": ["a", "b", "c", "d"]},
{"columns": ["a"], "index": range(100000)},
{"columns": ["a", "col2", "...col n"], "index": range(100000)},
{"index": cudf.Series(range(100000)).astype("str")},
{
"columns": ["a", "b", "c", "d"],
"index": cudf.Series(range(100000)).astype("str"),
},
],
)
@pytest.mark.parametrize(
"slice",
[
slice(250000, 500000),
slice(250000, 250001),
slice(500000),
slice(25000, 50000),
slice(25000, 25001),
slice(50000),
slice(1, 10),
slice(10, 20),
slice(15, 24000),
slice(6),
@bdice (Contributor) commented on Feb 11, 2022:

If we're testing multiple combinations, we should have coverage of unique code paths: three-argument slices like slice(start, stop, step), negative indices, reversed slices, and empty slices. In the spirit of reducing runtime, some of the other cases can probably be removed if we aim for covering only unique cases. Also, I see no reason why we can't cut this test down to 100 rows instead of 100,000.

Suggested change
slice(6),
slice(6, None), # start but no stop, [6:]
slice(None, None, 3), # only step, [::3]
slice(1, 10, 2), # start, stop, step
slice(3, -5, 2), # negative stop
slice(-2, -4), # slice is empty
slice(-10, -20, -1), # reversed slice
slice(None), # slices everything, same as [:]

Contributor Author replied:

I tried some of these and we actually get multiple failures. Raising an issue now.

The reviewer (Contributor) replied:

Glad I could help catch a bug here. Please tag me in that issue, I'm interested in seeing what you found. Slice all the things! 🥷⚔️🥷

Contributor Author replied:

Raised #10292.

],
)
def test_dataframe_sliced(gdf, slice):
def test_dataframe_sliced(gdf_kwargs, slice):
gdf = cudf.DataFrame(**gdf_kwargs)
pdf = gdf.to_pandas()

actual = gdf[slice]
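
To make the suggested slice cases concrete, a quick pandas-only illustration (assumed toy data, not part of the PR) of what the extra shapes select; per the thread above, some of these fail in cuDF and are tracked in #10292:

import pandas as pd

pdf = pd.DataFrame({"a": range(20)})

print(len(pdf[slice(6, None)]))        # 14 rows, same as [6:]
print(len(pdf[slice(None, None, 3)]))  # 7 rows, same as [::3]
print(len(pdf[slice(-2, -4)]))         # 0 rows, empty slice
print(len(pdf[slice(-10, -20, -1)]))   # 10 rows, reversed slice
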
24 changes: 13 additions & 11 deletions python/cudf/cudf/tests/test_orc.py
@@ -16,6 +16,7 @@

import cudf
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
assert_eq,
gen_rand_series,
@@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
path, engine=engine, columns=columns, use_index=use_index
)

assert_eq(expect, got, check_categorical=False)
assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)


def test_orc_reader_filenotfound(tmpdir):
@@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
else:
print(type(excpr).__name__)

expect = orcfile.read(columns=columns).to_pandas()
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression)
got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
expect.to_orc(gdf_fname.strpath, compression=compression)
got = cudf.from_pandas(
pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
)

assert_eq(expect, got)
assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
else:
print(type(excpr).__name__)

expect = orcfile.read().to_pandas()
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq)
got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
expect = cudf.from_pandas(orcfile.read().to_pandas())
expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())

assert_eq(expect, got)
assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -492,8 +495,7 @@ def test_chunked_orc_writer(
writer.close()

got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

assert_eq(expect, got)
assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))


@pytest.mark.parametrize(
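
A small usage sketch of cudf.testing.assert_frame_equal, which these ORC tests now call on two cuDF frames; the presumed benefit is that the pandas expectation is converted once up front instead of round-tripping the GPU result through assert_eq, though that rationale is an inference rather than something stated in the diff:

import cudf
from cudf.testing import assert_frame_equal

# Both operands are cuDF DataFrames, so no .to_pandas() conversion is needed here.
expect = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
got = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

assert_frame_equal(expect, got)
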
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
assert_eq(expect, got)


@pytest.mark.parametrize("skip", range(0, 128))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_skiprows(skip, tmpdir):
num_rows = 128
num_rows = 10
Contributor Author commented:

Here I reason that if 0:10 work, then 11:128 should too.

@bdice (Contributor) replied on Feb 9, 2022:

Maybe even replace range(0, 10) with [0, 1, 5, 10]. Maybe even search the tests for the regex parametrize.*range. 🙃

src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
@@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("skip", range(0, 120))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_num_rows(skip, tmpdir):
num_rows = 128
num_rows = 20
src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
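
A rough sketch of the reviewer's regex hunt suggested above, scanning the test directory named in this PR's file headers for range-based parametrizations (illustrative only, not part of the change):

import re
from pathlib import Path

pattern = re.compile(r"parametrize\(.*range")  # flags parametrize(..., range(...)) usages

for path in sorted(Path("python/cudf/cudf/tests").glob("test_*.py")):
    for lineno, line in enumerate(path.read_text().splitlines(), start=1):
        if pattern.search(line):
            print(f"{path}:{lineno}: {line.strip()}")
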
45 changes: 19 additions & 26 deletions python/cudf/cudf/tests/test_repr.py
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import textwrap

@@ -13,7 +13,14 @@
from cudf.testing import _utils as utils
from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes

repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
repr_categories = [
"uint16",
"int64",
"float64",
"str",
"category",
"datetime64[ns]",
]


@pytest.mark.parametrize("dtype", repr_categories)
@@ -84,36 +91,22 @@ def test_full_series(nrows, dtype):
pd.reset_option("display.max_rows")


@pytest.mark.parametrize("nrows", [5, 10, 15])
@pytest.mark.parametrize("ncols", [5, 10, 15])
@pytest.mark.parametrize("size", [20, 21])
@pytest.mark.parametrize("dtype", repr_categories)
@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
def test_full_dataframe_20(dtype, nrows, ncols):
A reviewer (Contributor) commented:

Can we just merge test_full_dataframe_20 & test_full_dataframe_21 by parametrizing size into one test and also with reduced parametrization of nrows & ncols?

size = 20
pdf = pd.DataFrame(
{idx: np.random.randint(0, 100, size) for idx in range(size)}
).astype(dtype)
gdf = cudf.from_pandas(pdf)

assert pdf.__repr__() == gdf.__repr__()
assert pdf._repr_html_() == gdf._repr_html_()
assert pdf._repr_latex_() == gdf._repr_latex_()


@pytest.mark.parametrize("dtype", repr_categories)
@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1])
@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1])
def test_full_dataframe_21(dtype, nrows, ncols):
size = 21
def test_full_dataframe_20(dtype, size, nrows, ncols):
pdf = pd.DataFrame(
{idx: np.random.randint(0, 100, size) for idx in range(size)}
).astype(dtype)
gdf = cudf.from_pandas(pdf)

pd.options.display.max_rows = int(nrows)
pd.options.display.max_columns = int(ncols)
assert pdf.__repr__() == gdf.__repr__()
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
with pd.option_context(
"display.max_rows", int(nrows), "display.max_columns", int(ncols)
):
assert repr(pdf) == repr(gdf)
assert pdf._repr_html_() == gdf._repr_html_()
assert pdf._repr_latex_() == gdf._repr_latex_()


@given(
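
A minimal sketch of the pd.option_context pattern the merged repr test now uses: the context manager restores display options on exit, which is why the explicit pd.reset_option calls could be dropped.

import pandas as pd

before = pd.get_option("display.max_rows")

with pd.option_context("display.max_rows", 5, "display.max_columns", 5):
    # The overrides apply only inside this block.
    assert pd.get_option("display.max_rows") == 5

# Restored automatically on exit.
assert pd.get_option("display.max_rows") == before
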
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_reshape.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

import re

@@ -17,9 +17,9 @@
)


@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10])
@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10])
@pytest.mark.parametrize("num_rows", [1, 2, 1000])
@pytest.mark.parametrize("num_id_vars", [0, 1, 2])
@pytest.mark.parametrize("num_value_vars", [0, 1, 2])
@pytest.mark.parametrize("num_rows", [1, 2, 100])
@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
@pytest.mark.parametrize("nulls", ["none", "some", "all"])
def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
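
Back-of-the-envelope arithmetic for the melt test above (the dtype count is a placeholder, since NUMERIC_TYPES + DATETIME_TYPES is not enumerated in this diff): pytest expands the full cross product of parametrize values, so trimming each axis compounds.

n_dtypes = 14  # hypothetical stand-in for len(NUMERIC_TYPES + DATETIME_TYPES)

# num_id_vars x num_value_vars x num_rows x dtype x nulls
before = 4 * 4 * 3 * n_dtypes * 3
after = 3 * 3 * 3 * n_dtypes * 3

print(before, after)  # 2016 1134 for n_dtypes = 14
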