Reduce pytest runtime (#10203)
This PR reduces the overall runtime of the cuDF pytest suite. Changes include:

- asserting equality on the GPU where possible for large datasets (see the sketch below)
- reducing excessive test data sizes in some cases

part of #9999
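A minimal sketch of the first change, not taken from the diff: when both the expected and actual results are cudf objects, they can be compared directly with `cudf.testing.assert_frame_equal` instead of materialising both frames in pandas just to compare them (names and sizes below are illustrative).

```python
# Illustrative only: build the expected frame on the GPU and compare
# cudf-to-cudf, rather than converting a large frame to pandas to compare it.
import cupy as cp

import cudf
from cudf.testing import assert_frame_equal

n = 1_000_000
got = cudf.DataFrame({"a": cp.arange(n)})

expect = cudf.DataFrame({"a": cp.arange(n)})
assert_frame_equal(expect, got)
```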

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)

URL: #10203
brandon-b-miller committed Feb 15, 2022
1 parent 8b0737d commit 851e235
Showing 12 changed files with 106 additions and 101 deletions.
9 changes: 8 additions & 1 deletion python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import itertools
import re
import warnings
from collections.abc import Mapping, Sequence
@@ -330,3 +331,9 @@ def does_not_raise():

def xfail_param(param, **kwargs):
return pytest.param(param, marks=pytest.mark.xfail(**kwargs))


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
)
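A hypothetical test showing how the new helper might be used; the test body is mine and only illustrative. `combinations_with_replacement` generates each unordered dtype pair once, so symmetric operations are not re-tested with the operands swapped.

```python
import cudf
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise


@parametrize_numeric_dtypes_pairwise
def test_add_commutes(left_dtype, right_dtype):
    # Each unordered pair runs once: ("int8", "float32") but not also
    # ("float32", "int8").
    left = cudf.Series([1, 2, 3], dtype=left_dtype)
    right = cudf.Series([4, 5, 6], dtype=right_dtype)
    assert (left + right == right + left).all()
```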
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -210,7 +210,7 @@ def test_can_parse_no_schema():
assert_eq(expected, actual)


@pytest.mark.parametrize("rows", [0, 1, 10, 100000])
@pytest.mark.parametrize("rows", [0, 1, 10, 1000])
@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
def test_avro_compression(rows, codec):
schema = {
15 changes: 7 additions & 8 deletions python/cudf/cudf/tests/test_binops.py
@@ -4,7 +4,7 @@
import decimal
import operator
import random
-from itertools import product
+from itertools import combinations_with_replacement, product

import cupy as cp
import numpy as np
@@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype):


def _series_compare_nulls_typegen():
-    tests = []
-    tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
-    tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
-    tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
-    tests += list(product(STRING_TYPES, STRING_TYPES))
-
-    return tests
+    return [
+        *combinations_with_replacement(DATETIME_TYPES, 2),
+        *combinations_with_replacement(TIMEDELTA_TYPES, 2),
+        *combinations_with_replacement(NUMERIC_TYPES, 2),
+        *combinations_with_replacement(STRING_TYPES, 2),
+    ]


@pytest.mark.parametrize("cmpop", _cmpops)
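For symmetric comparisons, swapping the operands adds no coverage, so replacing `product` with `combinations_with_replacement` roughly halves the number of generated dtype pairs. A quick count, using an assumed stand-in for cuDF's `NUMERIC_TYPES` list:

```python
from itertools import combinations_with_replacement, product

# Assumed stand-in for cudf.testing._utils.NUMERIC_TYPES (10 entries).
NUMERIC_TYPES = [
    "int8", "int16", "int32", "int64",
    "uint8", "uint16", "uint32", "uint64",
    "float32", "float64",
]

print(len(list(product(NUMERIC_TYPES, repeat=2))))                 # n**2 = 100
print(len(list(combinations_with_replacement(NUMERIC_TYPES, 2))))  # n*(n+1)//2 = 55
```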
11 changes: 6 additions & 5 deletions python/cudf/cudf/tests/test_csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.

import gzip
import os
@@ -8,6 +8,7 @@
from io import BytesIO, StringIO
from pathlib import Path

import cupy as cp
import numpy as np
import pandas as pd
import pytest
@@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir):
def test_csv_reader_carriage_return(tmpdir):
rows = 1000
names = ["int_row", "int_double_row"]

buffer = ",".join(names) + "\r\n"
for row in range(rows):
buffer += str(row) + ", " + str(2 * row) + "\r\n"

df = read_csv(StringIO(buffer))
+    expect = cudf.DataFrame(
+        {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
+    )

-    assert len(df) == rows
-    for row in range(0, rows):
-        assert df[names[0]][row] == row
-        assert df[names[1]][row] == 2 * row
+    assert_eq(expect, df)


def test_csv_reader_tabs():
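A sketch of the pattern applied in `test_csv_reader_carriage_return`, with the file-reading part stubbed out: build the expected frame on the device with CuPy and compare once, instead of pulling one scalar per row back to the host.

```python
import cupy as cp

import cudf
from cudf.testing._utils import assert_eq

rows = 1000
# Stand-in for the frame returned by read_csv in the real test.
df = cudf.DataFrame(
    {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
)

# Old pattern (slow): roughly 2 * rows device-to-host scalar transfers.
# for row in range(rows):
#     assert df["int_row"][row] == row
#     assert df["int_double_row"][row] == 2 * row

# New pattern: one comparison over the whole frame.
expect = cudf.DataFrame(
    {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
)
assert_eq(expect, df)
```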
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,13 +1,17 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
import operator

import cupy as cp
import numpy as np
import pytest
from numba import cuda, types
from numba.cuda import compile_ptx
from numba.np.numpy_support import from_dtype

from cudf import NA
from cudf.core.udf.api import Masked
from cudf.core.udf.typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise

arith_ops = (
operator.add,
@@ -159,19 +163,21 @@ def func(x):


@pytest.mark.parametrize("op", ops)
@pytest.mark.parametrize("ty1", number_types, ids=number_ids)
@pytest.mark.parametrize("ty2", number_types, ids=number_ids)
@parametrize_numeric_dtypes_pairwise
@pytest.mark.parametrize(
"masked",
((False, True), (True, False), (True, True)),
ids=("um", "mu", "mm"),
)
-def test_compile_arith_masked_ops(op, ty1, ty2, masked):
+def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
def func(x, y):
return op(x, y)

cc = (7, 5)

ty1 = from_dtype(np.dtype(left_dtype))
ty2 = from_dtype(np.dtype(right_dtype))

if masked[0]:
ty1 = MaskedType(ty1)
if masked[1]:
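The pairwise fixture yields dtype *strings*, so the test now derives the Numba types it needs inside the test body. A minimal sketch of that conversion (the dtype names are chosen arbitrarily):

```python
import numpy as np
from numba.np.numpy_support import from_dtype

left_dtype, right_dtype = "int32", "float64"  # one pair from the fixture

ty1 = from_dtype(np.dtype(left_dtype))   # Numba scalar type for int32
ty2 = from_dtype(np.dtype(right_dtype))  # Numba scalar type for float64
print(ty1, ty2)
```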
46 changes: 22 additions & 24 deletions python/cudf/cudf/tests/test_indexing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from itertools import combinations

@@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe):


@pytest.mark.parametrize(
"gdf",
"gdf_kwargs",
[
cudf.DataFrame({"a": range(1000000)}),
cudf.DataFrame({"a": range(1000000), "b": range(1000000)}),
cudf.DataFrame({"a": range(20), "b": range(20)}),
cudf.DataFrame(
{
{"data": {"a": range(100000)}},
{"data": {"a": range(100000), "b": range(100000)}},
{
"data": {
"a": range(20),
"b": range(20),
"c": ["abc", "def", "xyz", "def", "pqr"] * 4,
}
),
cudf.DataFrame(index=[1, 2, 3]),
cudf.DataFrame(index=range(1000000)),
cudf.DataFrame(columns=["a", "b", "c", "d"]),
cudf.DataFrame(columns=["a"], index=range(1000000)),
cudf.DataFrame(
columns=["a", "col2", "...col n"], index=range(1000000)
),
cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")),
cudf.DataFrame(
columns=["a", "b", "c", "d"],
index=cudf.Series(range(1000000)).astype("str"),
),
},
{"index": [1, 2, 3]},
{"index": range(100000)},
{"columns": ["a", "b", "c", "d"]},
{"columns": ["a"], "index": range(100000)},
{"columns": ["a", "col2", "...col n"], "index": range(100000)},
{"index": cudf.Series(range(100000)).astype("str")},
{
"columns": ["a", "b", "c", "d"],
"index": cudf.Series(range(100000)).astype("str"),
},
],
)
@pytest.mark.parametrize(
"slice",
[
-        slice(250000, 500000),
-        slice(250000, 250001),
-        slice(500000),
+        slice(25000, 50000),
+        slice(25000, 25001),
+        slice(50000),
slice(1, 10),
slice(10, 20),
slice(15, 24000),
slice(6),
],
)
-def test_dataframe_sliced(gdf, slice):
+def test_dataframe_sliced(gdf_kwargs, slice):
+    gdf = cudf.DataFrame(**gdf_kwargs)
pdf = gdf.to_pandas()

actual = gdf[slice]
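A sketch of the reworked pattern (the parameter values are mine): pytest evaluates `parametrize` arguments when the module is collected, so parametrizing over constructor kwargs and building the DataFrame inside the test avoids allocating large frames at collection time, on top of the smaller row counts.

```python
import pytest

import cudf


@pytest.mark.parametrize("gdf_kwargs", [{"data": {"a": range(100000)}}])
@pytest.mark.parametrize("sl", [slice(1, 10), slice(50000)])
def test_dataframe_sliced(gdf_kwargs, sl):
    # The frame is only constructed when this test actually runs.
    gdf = cudf.DataFrame(**gdf_kwargs)
    pdf = gdf.to_pandas()
    assert len(gdf[sl]) == len(pdf[sl])
```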
24 changes: 13 additions & 11 deletions python/cudf/cudf/tests/test_orc.py
@@ -16,6 +16,7 @@

import cudf
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
assert_eq,
gen_rand_series,
@@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
path, engine=engine, columns=columns, use_index=use_index
)

-    assert_eq(expect, got, check_categorical=False)
+    assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)


def test_orc_reader_filenotfound(tmpdir):
@@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
else:
print(type(excpr).__name__)

-    expect = orcfile.read(columns=columns).to_pandas()
-    cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression)
-    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
+    expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
+    expect.to_orc(gdf_fname.strpath, compression=compression)
+    got = cudf.from_pandas(
+        pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
+    )

-    assert_eq(expect, got)
+    assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
else:
print(type(excpr).__name__)

-    expect = orcfile.read().to_pandas()
-    cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq)
-    got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
+    expect = cudf.from_pandas(orcfile.read().to_pandas())
+    expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
+    got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())

-    assert_eq(expect, got)
+    assert_frame_equal(expect, got)


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -492,8 +495,7 @@ def test_chunked_orc_writer(
writer.close()

got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

-    assert_eq(expect, got)
+    assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))


@pytest.mark.parametrize(
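A self-contained sketch of the reworked ORC round-trip (the path and column values are placeholders): both sides stay as cudf DataFrames and are compared with `assert_frame_equal`, rather than comparing pandas frames on the host.

```python
import os
import tempfile

import pyarrow.orc

import cudf
from cudf.testing import assert_frame_equal

fname = os.path.join(tempfile.mkdtemp(), "roundtrip.orc")

expect = cudf.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
expect.to_orc(fname)

got = cudf.from_pandas(pyarrow.orc.ORCFile(fname).read().to_pandas())
assert_frame_equal(expect, got)
```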
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
assert_eq(expect, got)


@pytest.mark.parametrize("skip", range(0, 128))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_skiprows(skip, tmpdir):
-    num_rows = 128
+    num_rows = 10
src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
@@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("skip", range(0, 120))
@pytest.mark.parametrize("skip", [0, 1, 5, 10])
def test_parquet_reader_list_num_rows(skip, tmpdir):
-    num_rows = 128
+    num_rows = 20
src = pd.DataFrame(
{
"a": list_gen(int_gen, 0, num_rows, 80, 50),
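A quick count of the parametrized cases removed here (my arithmetic, not from the PR):

```python
before = len(range(0, 128)) + len(range(0, 120))  # 248 skiprows/num_rows cases
after = len([0, 1, 5, 10]) * 2                    # 8 cases
print(before, after)
```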
45 changes: 19 additions & 26 deletions python/cudf/cudf/tests/test_repr.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import textwrap

@@ -13,7 +13,14 @@
from cudf.testing import _utils as utils
from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes

-repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
+repr_categories = [
+    "uint16",
+    "int64",
+    "float64",
+    "str",
+    "category",
+    "datetime64[ns]",
+]


@pytest.mark.parametrize("dtype", repr_categories)
@@ -84,36 +91,22 @@ def test_full_series(nrows, dtype):
pd.reset_option("display.max_rows")


@pytest.mark.parametrize("nrows", [5, 10, 15])
@pytest.mark.parametrize("ncols", [5, 10, 15])
@pytest.mark.parametrize("size", [20, 21])
@pytest.mark.parametrize("dtype", repr_categories)
@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
def test_full_dataframe_20(dtype, nrows, ncols):
size = 20
pdf = pd.DataFrame(
{idx: np.random.randint(0, 100, size) for idx in range(size)}
).astype(dtype)
gdf = cudf.from_pandas(pdf)

assert pdf.__repr__() == gdf.__repr__()
assert pdf._repr_html_() == gdf._repr_html_()
assert pdf._repr_latex_() == gdf._repr_latex_()


@pytest.mark.parametrize("dtype", repr_categories)
@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1])
@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1])
def test_full_dataframe_21(dtype, nrows, ncols):
size = 21
def test_full_dataframe_20(dtype, size, nrows, ncols):
pdf = pd.DataFrame(
{idx: np.random.randint(0, 100, size) for idx in range(size)}
).astype(dtype)
gdf = cudf.from_pandas(pdf)

pd.options.display.max_rows = int(nrows)
pd.options.display.max_columns = int(ncols)
assert pdf.__repr__() == gdf.__repr__()
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
with pd.option_context(
"display.max_rows", int(nrows), "display.max_columns", int(ncols)
):
assert repr(pdf) == repr(gdf)
assert pdf._repr_html_() == gdf._repr_html_()
assert pdf._repr_latex_() == gdf._repr_latex_()


@given(
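A small sketch of the option handling: `pd.option_context` restores the display settings even if an assertion inside the block fails, which the manual `pd.options.display.*` / `pd.reset_option()` pairs did not guarantee.

```python
import pandas as pd

pdf = pd.DataFrame({0: range(20), 1: range(20)})

before = pd.get_option("display.max_rows")
with pd.option_context("display.max_rows", 5, "display.max_columns", 5):
    truncated = repr(pdf)  # rendered with at most 5 rows shown

assert pd.get_option("display.max_rows") == before  # restored on exit
```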
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_reshape.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.

import re

@@ -17,9 +17,9 @@
)


@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10])
@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10])
@pytest.mark.parametrize("num_rows", [1, 2, 1000])
@pytest.mark.parametrize("num_id_vars", [0, 1, 2])
@pytest.mark.parametrize("num_value_vars", [0, 1, 2])
@pytest.mark.parametrize("num_rows", [1, 2, 100])
@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
@pytest.mark.parametrize("nulls", ["none", "some", "all"])
def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
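Rough case-count comparison for `test_melt` (my arithmetic; assumes `NUMERIC_TYPES + DATETIME_TYPES` has about 15 entries):

```python
n_dtypes = 15  # assumed length of NUMERIC_TYPES + DATETIME_TYPES
nulls = 3      # "none", "some", "all"

before = 4 * 4 * 3 * n_dtypes * nulls  # 2160 parametrized cases
after = 3 * 3 * 3 * n_dtypes * nulls   # 1215 cases, each with smaller frames
print(before, after)
```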
