Updates to support correct comparisons of cuDF Series with different names #1928

Merged
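Apart from a small CI change in ci/gpu/build.sh (installing the nightly conda artifact with gpuci_mamba_retry instead of conda install), this PR replaces plain .equals() assertions in the cugraph test suite with cudf.testing.assert_series_equal and assert_frame_equal. Many of the tests compare Series that carry different names (for example degree_dg vs. degree_g), so the new assertions pass check_names=False to compare values and dtype while ignoring the name. A minimal sketch of the adopted pattern, using illustrative data that is not taken from the tests:

import cudf
from cudf.testing import assert_series_equal

expected = cudf.Series([1, 2, 3], name="degree_g")
actual = cudf.Series([1, 2, 3], name="degree_dg")

# Values and dtypes must match; the differing names are ignored.
assert_series_equal(actual, expected, check_names=False)

On mismatch, assert_series_equal raises an AssertionError that describes the difference, which is more informative than a bare assert with .equals().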
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
@@ -103,7 +103,7 @@ else
CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install
echo "Installing $CONDA_FILE"
-conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE"
+gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE"

gpuci_logger "Install the master version of dask and distributed"
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
11 changes: 7 additions & 4 deletions python/cugraph/cugraph/tests/dask/test_mg_degree.py
@@ -14,8 +14,10 @@
import gc
import pytest
import cudf
-import cugraph
import dask_cudf
+from cudf.testing import assert_series_equal
+
+import cugraph
from cugraph.dask.common.mg_utils import is_single_gpu
from cugraph.tests.utils import RAPIDS_DATASET_ROOT_DIR_PATH

@@ -65,6 +67,7 @@ def test_dask_mg_degree(dask_client):
.compute()
)

-assert merge_df_in["degree_dg"].equals(merge_df_in["degree_g"])
-assert merge_df_out["degree_dg"].equals(
-merge_df_out["degree_g"])
+assert_series_equal(merge_df_in["degree_dg"], merge_df_in["degree_g"],
+check_names=False)
+assert_series_equal(merge_df_out["degree_dg"], merge_df_out["degree_g"],
+check_names=False)
19 changes: 12 additions & 7 deletions python/cugraph/cugraph/tests/dask/test_mg_renumber.py
@@ -18,12 +18,13 @@

import pandas
import numpy as np

-import cugraph.dask as dcg
-import cugraph
import dask_cudf
import dask
import cudf
+from cudf.testing import assert_series_equal

+import cugraph.dask as dcg
+import cugraph
from cugraph.tests import utils
from cugraph.structure.number_map import NumberMap
from cugraph.dask.common.mg_utils import is_single_gpu
@@ -72,10 +73,14 @@ def test_mg_renumber(graph_file, dask_client):
"0_dst", "1_dst"])
unrenumbered_df = unrenumbered_df.reset_index()

-assert gdf["src"].equals(unrenumbered_df["0_src"])
-assert gdf["src_old"].equals(unrenumbered_df["1_src"])
-assert gdf["dst"].equals(unrenumbered_df["0_dst"])
-assert gdf["dst_old"].equals(unrenumbered_df["1_dst"])
+assert_series_equal(gdf["src"], unrenumbered_df["0_src"],
+check_names=False)
+assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"],
+check_names=False)
+assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"],
+check_names=False)
+assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"],
+check_names=False)


@pytest.mark.skipif(
35 changes: 10 additions & 25 deletions python/cugraph/cugraph/tests/dask/test_mg_replication.py
@@ -15,6 +15,7 @@
import gc

import cudf
+from cudf.testing import assert_series_equal, assert_frame_equal

import cugraph
import cugraph.dask.structure.replication as replication
@@ -46,9 +47,7 @@ def test_replicate_cudf_dataframe_with_weights(
worker_to_futures = replication.replicate_cudf_dataframe(df)
for worker in worker_to_futures:
replicated_df = worker_to_futures[worker].result()
-assert df.equals(replicated_df), (
-"There is a mismatch in one " "of the replications"
-)
+assert_frame_equal(df, replicated_df)


@pytest.mark.skipif(
@@ -68,9 +67,7 @@ def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client):
worker_to_futures = replication.replicate_cudf_dataframe(df)
for worker in worker_to_futures:
replicated_df = worker_to_futures[worker].result()
-assert df.equals(replicated_df), (
-"There is a mismatch in one " "of the replications"
-)
+assert_frame_equal(df, replicated_df)


@pytest.mark.skipif(
@@ -92,9 +89,7 @@ def test_replicate_cudf_series(input_data_path, dask_client):
worker_to_futures = replication.replicate_cudf_series(series)
for worker in worker_to_futures:
replicated_series = worker_to_futures[worker].result()
-assert series.equals(replicated_series), (
-"There is a " "mismatch in one of the replications"
-)
+assert_series_equal(series, replicated_series, check_names=False)
# FIXME: If we do not clear this dictionary, when comparing
# results for the 2nd column, one of the workers still
# has a value from the 1st column
@@ -225,7 +220,7 @@ def test_enable_batch_edgelist_replication(
df = G.edgelist.edgelist_df
for worker in G.batch_edgelists:
replicated_df = G.batch_edgelists[worker].result()
-assert df.equals(replicated_df), "Replication of edgelist failed"
+assert_frame_equal(df, replicated_df)


@pytest.mark.skipif(
@@ -257,15 +252,9 @@ def test_enable_batch_adjlist_replication_weights(
weights = adjlist.weights
for worker in G.batch_adjlists:
(rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker]
-assert offsets.equals(rep_offsets.result()), (
-"Replication of " "adjlist offsets failed"
-)
-assert indices.equals(rep_indices.result()), (
-"Replication of " "adjlist indices failed"
-)
-assert weights.equals(rep_weights.result()), (
-"Replication of " "adjlist weights failed"
-)
+assert_series_equal(offsets, rep_offsets.result(), check_names=False)
+assert_series_equal(indices, rep_indices.result(), check_names=False)
+assert_series_equal(weights, rep_weights.result(), check_names=False)


@pytest.mark.skipif(
@@ -295,10 +284,6 @@ def test_enable_batch_adjlist_replication_no_weights(
weights = adjlist.weights
for worker in G.batch_adjlists:
(rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker]
-assert offsets.equals(rep_offsets.result()), (
-"Replication of " "adjlist offsets failed"
-)
-assert indices.equals(rep_indices.result()), (
-"Replication of " "adjlist indices failed"
-)
+assert_series_equal(offsets, rep_offsets.result(), check_names=False)
+assert_series_equal(indices, rep_indices.result(), check_names=False)
assert weights is None and rep_weights is None
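The replication tests above also switch whole-DataFrame checks from df.equals(...) with a hand-assembled message to cudf.testing.assert_frame_equal, which raises a descriptive AssertionError on its own when any column differs. A minimal sketch of that call, using an illustrative frame rather than the replicated edgelist from the tests:

import cudf
from cudf.testing import assert_frame_equal

df = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0], "weights": [0.1, 0.2, 0.3]})
replicated_df = df.copy()

# Passes silently when all columns, dtypes, and the index match.
assert_frame_equal(df, replicated_df)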
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/tests/test_jaccard.py
@@ -15,6 +15,8 @@
import pytest

import cudf
+from cudf.testing import assert_series_equal
+
import cugraph
from cugraph.tests import utils

@@ -252,4 +254,6 @@ def test_jaccard_multi_column(read_csv):
df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"])
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/tests/test_overlap.py
@@ -17,6 +17,8 @@
import scipy

import cudf
+from cudf.testing import assert_series_equal
+
import cugraph
from cugraph.tests import utils

@@ -186,4 +188,6 @@ def test_overlap_multi_column(graph_file):
df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
5 changes: 3 additions & 2 deletions python/cugraph/cugraph/tests/test_random_walks.py
@@ -12,12 +12,13 @@
# limitations under the License.

import gc
+import random

import pytest
+from cudf.testing import assert_series_equal

from cugraph.tests import utils
import cugraph
-import random


# =============================================================================
@@ -158,7 +159,7 @@ def test_random_walks_coalesced(
v_offsets = [0] + path_data[2].cumsum()[:-1].to_array().tolist()
w_offsets = [0] + (path_data[2]-1).cumsum()[:-1].to_array().tolist()

-assert df['weight_sizes'].equals(path_data[2]-1)
+assert_series_equal(df['weight_sizes'], path_data[2]-1, check_names=False)
assert df['vertex_offsets'].to_array().tolist() == v_offsets
assert df['weight_offsets'].to_array().tolist() == w_offsets

45 changes: 25 additions & 20 deletions python/cugraph/cugraph/tests/test_renumber.py
@@ -18,6 +18,7 @@
import pandas as pd
import pytest
import cudf
+from cudf.testing import assert_series_equal

from cugraph.structure.number_map import NumberMap
from cugraph.tests import utils
@@ -53,8 +54,8 @@ def test_renumber_ips():
check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
)["0"]

-assert check_src.equals(gdf["source_as_int"])
-assert check_dst.equals(gdf["dest_as_int"])
+assert_series_equal(check_src, gdf["source_as_int"], check_names=False)
+assert_series_equal(check_dst, gdf["dest_as_int"], check_names=False)


def test_renumber_ips_cols():
@@ -88,8 +89,8 @@ def test_renumber_ips_cols():
check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
)["0"]

-assert check_src.equals(gdf["source_as_int"])
-assert check_dst.equals(gdf["dest_as_int"])
+assert_series_equal(check_src, gdf["source_as_int"], check_names=False)
+assert_series_equal(check_dst, gdf["dest_as_int"], check_names=False)


@pytest.mark.skip(reason="temporarily dropped string support")
@@ -121,8 +122,8 @@ def test_renumber_ips_str_cols():
check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
)["0"]

-assert check_src.equals(gdf["source_list"])
-assert check_dst.equals(gdf["dest_list"])
+assert_series_equal(check_src, gdf["source_list"], check_names=False)
+assert_series_equal(check_dst, gdf["dest_list"], check_names=False)


def test_renumber_negative():
@@ -142,8 +143,8 @@ def test_renumber_negative():
check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
)["0"]

-assert check_src.equals(gdf["source_list"])
-assert check_dst.equals(gdf["dest_list"])
+assert_series_equal(check_src, gdf["source_list"], check_names=False)
+assert_series_equal(check_dst, gdf["dest_list"], check_names=False)


def test_renumber_negative_col():
@@ -163,8 +164,8 @@ def test_renumber_negative_col():
check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
)["0"]

-assert check_src.equals(gdf["source_list"])
-assert check_dst.equals(gdf["dest_list"])
+assert_series_equal(check_src, gdf["source_list"], check_names=False)
+assert_series_equal(check_dst, gdf["dest_list"], check_names=False)


@pytest.mark.skip(reason="dropped renumbering from series support")
@@ -200,8 +201,8 @@ def test_renumber_series(graph_file):
check_dst = numbering_series_2.from_internal_vertex_id(renumbered_dst,
"dst_id")

-assert check_src["0_y"].equals(check_src["0_x"])
-assert check_dst["0_y"].equals(check_dst["0_x"])
+assert_series_equal(check_src["0_y"], check_src["0_x"], check_names=False)
+assert_series_equal(check_dst["0_y"], check_dst["0_x"], check_names=False)


@pytest.mark.parametrize("graph_file", utils.DATASETS)
@@ -233,8 +234,8 @@ def test_renumber_files(graph_file):
unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
preserve_order=True)

-assert exp_src.equals(unrenumbered_df["src"])
-assert exp_dst.equals(unrenumbered_df["dst"])
+assert_series_equal(exp_src, unrenumbered_df["src"], check_names=False)
+assert_series_equal(exp_dst, unrenumbered_df["dst"], check_names=False)


@pytest.mark.parametrize("graph_file", utils.DATASETS)
@@ -265,8 +266,8 @@ def test_renumber_files_col(graph_file):
unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
preserve_order=True)

-assert exp_src.equals(unrenumbered_df["src"])
-assert exp_dst.equals(unrenumbered_df["dst"])
+assert_series_equal(exp_src, unrenumbered_df["src"], check_names=False)
+assert_series_equal(exp_dst, unrenumbered_df["dst"], check_names=False)


@pytest.mark.parametrize("graph_file", utils.DATASETS)
@@ -295,7 +296,11 @@ def test_renumber_files_multi_col(graph_file):
unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
preserve_order=True)

-assert gdf["src"].equals(unrenumbered_df["0_src"])
-assert gdf["src_old"].equals(unrenumbered_df["1_src"])
-assert gdf["dst"].equals(unrenumbered_df["0_dst"])
-assert gdf["dst_old"].equals(unrenumbered_df["1_dst"])
+assert_series_equal(gdf["src"], unrenumbered_df["0_src"],
+check_names=False)
+assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"],
+check_names=False)
+assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"],
+check_names=False)
+assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"],
+check_names=False)
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/tests/test_sorensen.py
@@ -15,6 +15,8 @@
import pytest

import cudf
+from cudf.testing import assert_series_equal
+
import cugraph
from cugraph.tests import utils

@@ -241,4 +243,6 @@ def test_sorensen_multi_column(read_csv):
df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["sorensen_coeff"].equals(df_exp["sorensen_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"])
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/tests/test_wjaccard.py
@@ -17,6 +17,8 @@
import pytest

import cudf
+from cudf.testing import assert_series_equal
+
import cugraph
from cugraph.tests import utils

@@ -168,4 +170,6 @@ def test_wjaccard_multi_column(read_csv):
df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"])
5 changes: 4 additions & 1 deletion python/cugraph/cugraph/tests/test_woverlap.py
@@ -17,6 +17,7 @@
import scipy
import numpy as np
import cudf
+from cudf.testing import assert_series_equal

import cugraph
from cugraph.tests import utils
@@ -157,4 +158,6 @@ def test_woverlap_multi_column(graph_file):
df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/tests/test_wsorensen.py
@@ -17,6 +17,8 @@
import pytest

import cudf
+from cudf.testing import assert_series_equal
+
import cugraph
from cugraph.tests import utils

@@ -172,4 +174,6 @@ def test_wsorensen_multi_column(read_csv):
df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]])

# Calculating mismatch
-assert df_res["sorensen_coeff"].equals(df_exp["sorensen_coeff"])
+actual = df_res.sort_values("0_source").reset_index()
+expected = df_exp.sort_values("source").reset_index()
+assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"])
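The multi-column similarity tests above (jaccard, overlap, sorensen and their weighted variants) all converge on the same comparison pattern: sort each result frame on its vertex column, reset the index so both coefficient Series share a plain RangeIndex, then let assert_series_equal check the values. A standalone sketch of that pattern with made-up coefficients, not data from the tests:

import cudf
from cudf.testing import assert_series_equal

df_res = cudf.DataFrame({"0_source": [2, 0, 1], "jaccard_coeff": [0.50, 1.00, 0.25]})
df_exp = cudf.DataFrame({"source": [0, 1, 2], "jaccard_coeff": [1.00, 0.25, 0.50]})

# Align row order on the vertex column, then rebuild the index so the two
# Series compare element-wise on a matching RangeIndex.
actual = df_res.sort_values("0_source").reset_index()
expected = df_exp.sort_values("source").reset_index()
assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"])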