# Performance comparison

This notebook compares the performance of `cuDF` and `pandas` on identical data sizes. It primarily showcases the speedup factors
users can get when equivalent `pandas` APIs are run on GPUs using `cudf`.

The hardware details used to run these performance comparisons are at the end of this page.

**Note**: This notebook is written to measure performance on NVIDIA GPUs with large memory. If running on hardware with lower memory, please consider lowering the `num_rows` values. Performance results may vary by data size, as well as the CPU and GPU used.

In [None]:
import os
import time
import timeit
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import cudf
# Print the cuDF version in use (the `=` specifier also prints the expression).
print(f"{cudf.__version__=}")

In [None]:
# Seed NumPy's global RNG so the randomly generated inputs are reproducible.
np.random.seed(0)

## Concat, count & joins performance

In [None]:
# Repetitions per timed operation, and the row count of the benchmark frame.
timeit_number = 30
num_rows = 300_000_000
# One third of the full row count; used later to shrink the frames.
sub_sample = num_rows // 3

In [None]:
# pandas input frame with `num_rows` rows: an int64 column drawn from
# [-1000, 1000) and a column of names drawn from four fixed choices.
pdf = pd.DataFrame(
    {
        "numbers": np.random.randint(-1000, 1000, num_rows, dtype="int64"),
        "business": np.random.choice(
            ["McD", "Buckees", "Walmart", "Costco"], size=num_rows
        ),
    }
)
pdf

In [None]:
# Convert the pandas frame to a cuDF DataFrame so both libraries are timed
# on the same data.
gdf = cudf.from_pandas(pdf)
gdf

In [None]:
def timeit_pandas_cudf(pd_obj, gd_obj, func, **kwargs):
    """
    Time one callable against a pandas object and a cuDF object.

    Parameters
    ----------
    pd_obj : Pandas object
        Argument passed to ``func`` for the pandas measurement.
    gd_obj : cuDF object
        Argument passed to ``func`` for the cuDF measurement.
    func : callable
        Invoked as ``func(obj)`` with a single positional argument.
    **kwargs
        Forwarded unchanged to ``timeit.timeit`` (for example ``number``).

    Returns
    -------
    tuple
        ``(pandas_time, cudf_time)``, each the value returned by
        ``timeit.timeit`` for the corresponding object.
    """

    def run_on_pandas():
        return func(pd_obj)

    def run_on_cudf():
        return func(gd_obj)

    # The tuple is evaluated left to right: pandas is timed first, then cuDF.
    return (
        timeit.timeit(run_on_pandas, **kwargs),
        timeit.timeit(run_on_cudf, **kwargs),
    )

In [None]:
# Time `value_counts()` on the full-size frames, `timeit_number` runs per library.
pandas_value_counts, cudf_value_counts = timeit_pandas_cudf(
    pdf, gdf, lambda df: df.value_counts(), number=timeit_number
)

In [None]:
# Keep only the first `sub_sample` rows of each frame for the next benchmarks.
pdf = pdf.head(sub_sample)
gdf = gdf.head(sub_sample)

In [None]:
# Time concatenating the pandas frame with itself three times.
pandas_concat = timeit.timeit(lambda: pd.concat([pdf, pdf, pdf]), number=timeit_number)

In [None]:
# Time concatenating the cuDF frame with itself three times.
cudf_concat = timeit.timeit(lambda: cudf.concat([gdf, gdf, gdf]), number=timeit_number)

In [None]:
# Time a groupby on "business" that computes min, max and mean per group.
pandas_groupby, cudf_groupby = timeit_pandas_cudf(
    pdf,
    gdf,
    lambda df: df.groupby("business").agg(["min", "max", "mean"]),
    number=timeit_number,
)

In [None]:
# Input for the self-merge benchmark.
# With no `on` argument the merge joins on every shared column, i.e. on both
# "numbers" and "business". That gives only 2000 * 4 = 8000 distinct keys, so
# an inner self-merge is many-to-many and yields roughly rows**2 / 8000 output
# rows. Using `sub_sample` (100 million) rows here would produce on the order
# of 1e12 rows, so a dedicated, much smaller frame is used instead
# (1_000_000 inputs -> about 125 million output rows).
merge_rows = 1_000_000
pdf = pd.DataFrame(
    {
        "numbers": np.random.randint(-1000, 1000, merge_rows, dtype="int64"),
        "business": np.random.choice(
            ["McD", "Buckees", "Walmart", "Costco"], size=merge_rows
        ),
    }
)
gdf = cudf.from_pandas(pdf)

In [None]:
# Time merging each frame with itself. No `on`/`how` is passed, so the join
# keys and join type are whatever each library uses by default.
pandas_merge, cudf_merge = timeit_pandas_cudf(
    pdf, gdf, lambda df: df.merge(df), number=timeit_number
)

In [None]:
# Speedup is pandas time divided by cuDF time, so values above 1 mean cuDF
# was faster. One row per benchmarked operation.
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_value_counts / cudf_value_counts,
            pandas_concat / cudf_concat,
            pandas_groupby / cudf_groupby,
            pandas_merge / cudf_merge,
        ],
    },
    index=["value_counts", "concat", "groupby", "merge"],
)
performance_df

In [None]:
def performance_plot(df, xlabel=None):
    """
    Draw a bar chart of the "cudf speedup vs. pandas" column and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "cudf speedup vs. pandas" column; the index
        supplies the bar labels on the x axis.
    xlabel : str, optional
        Label for the x axis.
    """
    max_speedup = df["cudf speedup vs. pandas"].max()
    # Upper y limit is 5% above the largest value (max + max / 20).
    ylim_max = max_speedup + (max_speedup / 20)
    ax = df.plot.bar(
        color="#7400ff",
        # The y axis starts at 1, i.e. at parity between pandas and cuDF.
        ylim=(1, ylim_max),
        rot=0,
        xlabel=xlabel,
        ylabel="Speedup factor",
    )
    # Label each bar with its value formatted without decimals.
    ax.bar_label(ax.containers[0], fmt="%.0f")
    plt.show()

In [None]:
# Plot the per-operation speedups held in `performance_df`.
performance_plot(performance_df, xlabel="Operation")

In [None]:
# Cleaning up used memory for later benchmarks
import gc

# Drop the notebook's references to both frames, then force a collection.
del pdf, gdf

# The collector's return value is bound to `_` rather than left as the
# cell's last expression.
_ = gc.collect()

## Strings Performance

In [None]:
# Repetitions and row count for the string-method benchmarks.
timeit_number = 20
num_rows = 300_000_000

In [None]:
# pandas Series of `num_rows` strings drawn at random from five fixed values.
pd_series = pd.Series(
    np.random.choice(
        ["123", "56.234", "Walmart", "Costco", "rapids ai"], size=num_rows
    )
)

In [None]:
# cuDF counterpart of `pd_series`, holding the same data.
gd_series = cudf.from_pandas(pd_series)

In [None]:
# Time `str.upper()` on both series.
pandas_upper, cudf_upper = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.str.upper(), number=timeit_number
)

In [None]:
# Time a regex `str.contains`; the pattern matches a digit immediately
# followed by a lowercase ASCII letter.
pandas_contains, cudf_contains = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.str.contains(r"[0-9][a-z]"), number=timeit_number
)

In [None]:
# Time `str.isalpha()` on both series.
pandas_isalpha, cudf_isalpha = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.str.isalpha(), number=timeit_number
)

In [None]:
# Speedup is pandas time divided by cuDF time; one row per string method.
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_upper / cudf_upper,
            pandas_contains / cudf_contains,
            pandas_isalpha / cudf_isalpha,
        ],
    },
    index=["upper", "contains", "isalpha"],
)
performance_df

In [None]:
# Plot the string-method speedups held in `performance_df`.
performance_plot(performance_df, xlabel="String method")

## User-defined function (UDF) performance (with JIT overhead)

The first run of a UDF includes JIT compilation overhead, so the performance of the first run and the average of the next few runs are compared separately.

In [None]:
# Row count for the UDF benchmarks.
num_rows = 10_000_000

In [None]:
# pandas frame with a single "age" column of random integers in [0, 100).
pdf_age = pd.DataFrame(
    {
        "age": np.random.randint(0, 100, num_rows),
    }
)
pdf_age

In [None]:
# cuDF counterpart of `pdf_age`, holding the same data.
gdf_age = cudf.from_pandas(pdf_age)
gdf_age

In [None]:
def age_udf(row):
    """
    Map the "age" field of ``row`` to a bucket index.

    Buckets: below 18 -> 0, [18, 20) -> 1, then one bucket per decade from
    [20, 30) -> 2 up to [60, 70) -> 6; anything else -> 7.
    """
    age = row["age"]
    # Each check is only reached when every earlier upper bound was not met,
    # so testing the upper bound alone is enough.
    if age < 18:
        return 0
    if age < 20:
        return 1
    if age < 30:
        return 2
    if age < 40:
        return 3
    if age < 50:
        return 4
    if age < 60:
        return 5
    if age < 70:
        return 6
    return 7

In [None]:
# Time a single call (number=1) of the row-wise `apply(age_udf, axis=1)` in
# each library; per the section text, this first run includes JIT overhead.
pandas_int_udf, cudf_int_udf = timeit_pandas_cudf(
    pdf_age, gdf_age, lambda df: df.apply(age_udf, axis=1), number=1
)

In [None]:
def str_isupper_udf(row):
    """Return 0 when ``row.isupper()`` is true, otherwise 1."""
    return 0 if row.isupper() else 1

In [None]:
# pandas Series named "strings" with `num_rows` values drawn at random from
# four fixed strings (two all-uppercase, two not).
pd_series = pd.Series(
    np.random.choice(["ABC", "abc", "hello world", "AI"], size=num_rows),
    name="strings",
)
pd_series

In [None]:
# cuDF counterpart of `pd_series`, holding the same data.
gd_series = cudf.from_pandas(pd_series)
gd_series

In [None]:
# Time a single call (number=1) of `apply(str_isupper_udf)` in each library;
# per the section text, this first run includes JIT overhead.
pandas_str_udf, cudf_str_udf = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.apply(str_isupper_udf), number=1
)

In [None]:
# First-run speedups (pandas time divided by cuDF time), one row per UDF kind.
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_int_udf / cudf_int_udf,
            pandas_str_udf / cudf_str_udf,
        ]
    },
    index=["Numeric", "String"],
)
performance_df

Below is the plot showing performance speedup in case of Numeric UDFs & String UDFs on their first runs.

In [None]:
# Plot the first-run UDF speedups held in `performance_df`.
performance_plot(performance_df, xlabel="UDF Kind")

## User-defined function (UDF) performance (without JIT overhead)

In [None]:
# Number of repetitions for the repeated UDF timings below.
timeit_number = 10

In [None]:
# Re-time the numeric UDF over `timeit_number` runs; this overwrites the
# first-run timings stored under the same names.
pandas_int_udf, cudf_int_udf = timeit_pandas_cudf(
    pdf_age, gdf_age, lambda df: df.apply(age_udf, axis=1), number=timeit_number
)

In [None]:
# Re-time the string UDF over `timeit_number` runs; this overwrites the
# first-run timings stored under the same names.
pandas_str_udf, cudf_str_udf = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.apply(str_isupper_udf), number=timeit_number
)

In [None]:
# Repeated-run speedups (pandas time divided by cuDF time), one row per UDF kind.
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_int_udf / cudf_int_udf,
            pandas_str_udf / cudf_str_udf,
        ]
    },
    index=["Numeric", "String"],
)
performance_df

Below is the plot showing the performance speedup for numeric and string UDFs on their subsequent runs. In this case the speedup is massive because there is no JIT overhead.

In [None]:
# Plot the repeated-run UDF speedups held in `performance_df`.
performance_plot(performance_df, xlabel="UDF Kind")

## UDF Performance in GroupBy

In [None]:
# Row count and repetitions for the grouped-UDF benchmark.
num_rows = 100_000_000
timeit_number = 10

In [None]:
# pandas frame with two random integer columns: "key" takes the values 0 or 1
# and "val" is drawn from [0, 7).
pdf = pd.DataFrame()
pdf["key"] = np.random.randint(0, 2, num_rows)
pdf["val"] = np.random.randint(0, 7, num_rows)


def custom_formula_udf(df):
    """Store ``key * val - 10`` under "out" in ``df`` and return ``df``."""
    key_times_val = df["key"] * df["val"]
    df["out"] = key_times_val - 10
    return df


# cuDF counterpart of `pdf`, holding the same data.
gdf = cudf.from_pandas(pdf)

In [None]:
# Time a grouped `apply` of `custom_formula_udf`, grouping by "key", with
# `group_keys=False` passed to both libraries.
pandas_udf_groupby, cudf_udf_groupby = timeit_pandas_cudf(
    pdf,
    gdf,
    lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
    number=timeit_number,
)

In [None]:
# Single-row frame holding the grouped-UDF speedup (pandas time / cuDF time).
performance_df = pd.DataFrame(
    {"cudf speedup vs. pandas": [pandas_udf_groupby / cudf_udf_groupby]},
    index=["Grouped UDF"],
)
performance_df

In [None]:
# Plot the grouped-UDF speedup; no x-axis label is passed here.
performance_plot(performance_df)

# System Configuration

## CPU Configuration

In [None]:
!lscpu

## GPU Configuration

In [None]:
!nvidia-smi