In [16]:
from torch.utils.cpp_extension import load
import torch
import random
import statistics
import matplotlib.pyplot as plt

from collections import defaultdict

bench_shared = load(name="bench_shared", sources=["cpp/bench_shared_entry.cpp", "cpp/bench_shared.cu"], extra_cuda_cflags=["--keep", "--keep-dir", "/workspace/benchmark/temp", "--extended-lambda"], verbose=True)

outs = torch.zeros((1024*32), dtype=torch.int32, device="cuda")

results = defaultdict(lambda:defaultdict(list))

dtypes = ["float32", "int32"]#, "uint32", "half2", "double", "int64"]
ops = ["add", "inc", "max", "xor", "or", "exch", "mul", "add_manual", "max_manual", "donothing_manual", "add_nochange", "add_warpcoalesced"]

strat = 0 # TIDX

blocks = torch.cuda.get_device_properties(0).multi_processor_count

our_n_threads = 128

for strat in (0, 1):
    for op, op_name in enumerate(ops):
        if op_name not in ("mul", "add", "add_manual", "add_nochange"): continue
        for dtype_enum, dtype in enumerate(dtypes):
            if bench_shared.bench_shared(dtype_enum, outs, op, 128, 128, 0) == 1:
                continue
            # if dtype not in ("int32", "inc", "float32"): continue
            for n_threads in (32, 64, 128, 256, 512, 1024):
                if n_threads not in (our_n_threads,): continue
                for shmem_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
                    if shmem_size > n_threads: continue
                    clocks = []
                    for i in range(100):
                        outs.zero_()
                        result = bench_shared.bench_shared(dtype_enum, outs, op, shmem_size, n_threads, strat)
                        if result == 1:
                            print(f"No kernel for ")
                            break

                        outs_list = [int(a) for a in outs[:(n_threads//32)*blocks].tolist()]
                        if outs_list.count(-1) != 0:
                            print(f"FOUND {outs_list.count(-1)} -1 VALUES!!!")
                            print(outs_list)
                            raise AssertionError
                        elif outs_list.count(0) != 0:
                            print(f"FOUND {outs_list.count(0)} 0 VALUES")
                            print(outs_list)
                            raise AssertionError

                        clocks.extend(outs_list)
                    if not clocks: continue
                    mean = int(statistics.mean(clocks))
                    print(f"FOR {dtype=}\top={op_name}\titerations=512\t{shmem_size=}\t{n_threads=}\tmean: {mean}")
                    results[n_threads][f"{op_name}_{dtype}_{strat}"].append(mean)
                    if mean < 1000:
                        pass # print("\n\nTHIS OP AND DTYPE ARE PROBABLY EMPTY\n\n")

                # print(f"{results=}")

results = {key: dict(results[key]) for key in results}

print(f"{results[our_n_threads].keys()=}")

# third figure
import math
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 10))  # Set a larger figure size for better legibility

plt.plot([a / 512 for a in results[our_n_threads]['add_float32_1']], label=f"float32 add", color="blue")
plt.plot([a / 512 for a in results[our_n_threads]['mul_float32_1']], label=f"float32 mul", color="orange")
plt.plot([a / 512 for a in results[our_n_threads]['add_manual_float32_1']], label=f"float32 add manual", color="green")
plt.plot([a / 512 for a in results[our_n_threads]['add_nochange_float32_1']], label=f"float32 add nochange", color="yellow")
plt.plot([a / 512 for a in results[our_n_threads]['add_int32_1']], label=f"int32 add", color="red")

num_ticks = int(math.log(our_n_threads, 2)+1)

device = "RTX 4090" if "4090" in torch.cuda.get_device_name() else torch.cuda.get_device_name()
plt.title(f"Shared memory atomic add throughput on {device} (n_threads={our_n_threads})", fontsize=16)
plt.ylabel("Clock cycles per iteration", fontsize=14)
plt.xlabel("Contending threads", fontsize=14)
plt.yscale("log")
plt.xticks(list(range(num_ticks)), [math.ceil(our_n_threads / (2**i)) for i in range(num_ticks)], fontsize=12)
plt.yticks([10, 100, 1000, 10000], ["10", "100", "1,000", "10,000"], fontsize=12)
plt.grid(True, which="both", linestyle='--', linewidth=0.5)

# Adjust x-axis limits to remove whitespace
plt.xlim(0, num_ticks - 1)

plt.legend(loc='upper right', bbox_to_anchor=(1, 1), fontsize='large', ncol=1)

plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for the legends

plt.show()

Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
The input conditions for extension module bench_shared have changed. Bumping to version 5 and re-building as bench_shared_v5...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/bench_shared/build.ninja...
Building extension module bench_shared_v5...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=bench_shared_v5 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 --compiler-options '-fPIC' --keep --keep-dir /workspace/benchmark/temp --extended-lambda -std=c++17 -c /workspace/benchmark/cpp/bench_shared.cu -o bench_shared.cuda.o 


[2/3] c++ -MMD -MF be

Loading extension module bench_shared_v5...


KeyError: 128

In [14]:
print(f"{results[our_n_threads]['add_float32_1']=}")
print(f"{results[our_n_threads]['add_nochange_float32_1']=}")

results[our_n_threads]['add_float32_1']=[5741861, 2640494, 1298197, 752369, 466275, 325790, 295642, 276991]
results[our_n_threads]['add_nochange_float32_1']=[5741861, 2625372, 1293768, 750373, 463591, 326207, 295566, 276757]


In [None]:
# first figure
import math
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 10))  # Set a larger figure size for better legibility

plt.plot([a / 512 for a in results[our_n_threads]['add_float32']], label=f"float32", color="blue")
plt.plot([a / 512 for a in results[our_n_threads]['add_int32']], label=f"int32", color="red")

device = "RTX 4090" if "4090" in torch.cuda.get_device_name() else torch.cuda.get_device_name()
plt.title(f"Shared memory atomic add throughput on {device} (n_threads=256)", fontsize=16)
plt.ylabel("Clock cycles per iteration", fontsize=14)
plt.xlabel("Contending threads", fontsize=14)
plt.yscale("log")
plt.xticks(list(range(num_ticks)), [math.ceil(256 / (2**i)) for i in range(num_ticks)], fontsize=12)
plt.yticks([10, 100, 1000, 10000], ["10", "100", "1,000", "10,000"], fontsize=12)
plt.grid(True, which="both", linestyle='--', linewidth=0.5)

# Adjust x-axis limits to remove whitespace
plt.xlim(0, num_ticks - 1)

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='small', ncol=1)

plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for the legends

plt.show()

In [None]:
# second figure
import math
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 10))  # Set a larger figure size for better legibility

# plt.plot([a / 1024 for a in results[256]['max_int32']], label=f"max int32", color="green")
plt.plot([a / 512 for a in results[our_n_threads]['max_float32']], label=f"max float32 from int32", color="blue")
plt.plot([a / 512 for a in results[our_n_threads]['max_manual_float32']], label=f"max float32 with CAS", color="red")
plt.plot([a / 512 for a in results[our_n_threads]['donothing_manual']], label=f"CAS do nothing", color="green")

device = "RTX 4090" if "4090" in torch.cuda.get_device_name() else torch.cuda.get_device_name()
plt.title(f"Shared memory atomic max throughput on {device} (n_threads=256)", fontsize=16)
plt.ylabel("Clock cycles per iteration", fontsize=14)
plt.xlabel("Contending threads", fontsize=14)
plt.yscale("log")
plt.xticks(list(range(num_ticks)), [math.ceil(our_n_threads / (2**i)) for i in range(num_ticks)], fontsize=12)
plt.yticks([10, 100, 1000, 10000], ["10", "100", "1,000", "10,000"], fontsize=12)
plt.grid(True, which="both", linestyle='--', linewidth=0.5)

# Adjust x-axis limits to remove whitespace
plt.xlim(0, 9 - 1)

plt.legend(loc='upper right', bbox_to_anchor=(1, 1), fontsize='large', ncol=1)

plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for the legends

plt.show()

In [None]:
# third figure
import math
import matplotlib.pyplot as plt

for strat in (0, 1):
    plt.figure(figsize=(14, 10))  # Set a larger figure size for better legibility

    plt.plot([a / 512 for a in results[our_n_threads][f'add_float32_{strat}']], label=f"float32 add", color="blue")
    plt.plot([a / 512 for a in results[our_n_threads][f'mul_float32_{strat}']], label=f"float32 mul", color="orange")
    plt.plot([a / 512 for a in results[our_n_threads][f'add_manual_float32_{strat}']], label=f"float32 add manual", color="green")
    plt.plot([a / 512 for a in results[our_n_threads][f'add_nochange_float32_{strat}']], label=f"float32 add nochange", color="yellow")
    plt.plot([a / 512 for a in results[our_n_threads][f'add_int32_{strat}']], label=f"int32 add", color="red")


    device = "RTX 4090" if "4090" in torch.cuda.get_device_name() else torch.cuda.get_device_name()
    plt.title(f"Shared memory atomic add throughput on {device} (n_threads=128) {strat=}", fontsize=16)
    plt.ylabel("Clock cycles per iteration", fontsize=14)
    plt.xlabel("Contending threads", fontsize=14)
    plt.yscale("log")
    plt.xticks(list(range(num_ticks)), [math.ceil(our_n_threads / (2**i)) for i in range(num_ticks)], fontsize=12)
    plt.yticks([10, 100, 1000, 10000], ["10", "100", "1,000", "10,000"], fontsize=12)
    plt.grid(True, which="both", linestyle='--', linewidth=0.5)

    # Adjust x-axis limits to remove whitespace
    plt.xlim(0, num_ticks - 1)

    plt.legend(loc='upper right', bbox_to_anchor=(1, 1), fontsize='large', ncol=1)

    plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for the legends

    plt.show()

In [None]:
results[256]['max_manual_float32']

In [None]:
results[256]['donothing_manual_int32']

In [None]:
results[256].keys()