Skip to content

Commit 2e05c55

Browse files
committed
Update
[ghstack-poisoned]
2 parents 51ef97d + 8a34ec0 commit 2e05c55

File tree

246 files changed

+5848
-5105
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

246 files changed

+5848
-5105
lines changed

.github/actions/diskspace-cleanup/action.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ runs:
2727
docker system prune -af
2828
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
2929
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
30-
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
30+
diskspace_cutoff_int=$((diskspace_cutoff + 0))
31+
difference=$((100 - diskspace_cutoff_int))
32+
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
3133
echo "$msg"
3234
exit 1
3335
else

.github/workflows/build-almalinux-images.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
runs-on: linux.9xlarge.ephemeral
3737
strategy:
3838
matrix:
39-
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
39+
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "rocm7.1", "cpu"]
4040
steps:
4141
- name: Build docker image
4242
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

.github/workflows/build-libtorch-images.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ jobs:
5454
{ tag: "cuda12.6" },
5555
{ tag: "rocm6.4" },
5656
{ tag: "rocm7.0" },
57+
{ tag: "rocm7.1" },
5758
{ tag: "cpu" },
5859
]
5960
steps:

.github/workflows/build-manywheel-images.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ jobs:
5656
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
5757
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
5858
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
59+
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
5960
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
6061
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
6162
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },

aten/src/ATen/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
260260
if(USE_CUDA)
261261
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
262262
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
263-
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
263+
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
264264
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
265265
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
266266
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")

aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ class Vectorized<float> {
309309
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
310310
// Implementation copied from Arm Optimized Routines
311311
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
312-
Vectorized<float> exp_u20() const {
312+
inline Vectorized<float> vexpq_f32_u20() const {
313313
// bail out to sleef if it's a special case:
314314
// i.e. there's an input s.t. |input| > 87.3....
315315
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@@ -348,6 +348,9 @@ class Vectorized<float> {
348348

349349
return vfmaq_f32(scale, poly, scale);
350350
}
351+
Vectorized<float> exp_u20() const {
352+
return vexpq_f32_u20();
353+
}
351354
Vectorized<float> fexp_u20() const {
352355
return exp_u20();
353356
}
@@ -634,7 +637,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
634637
// - exp(- x * x)
635638
auto pow_2 = (*this) * (*this);
636639
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
637-
auto tmp4 = neg_pow_2.exp();
640+
auto tmp4 = neg_pow_2.vexpq_f32_u20();
638641
auto tmp5 = tmp4 ^ neg_zero_vec;
639642
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
640643
auto tmp6 = t * tmp5;

aten/src/ATen/cuda/CUDAGreenContext.cpp

Lines changed: 77 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,98 @@
11
#include <ATen/cuda/CUDAGreenContext.h>
22

3-
namespace at::cuda {
4-
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
5-
#if CUDA_HAS_GREEN_CONTEXT
6-
int driver_version;
7-
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
8-
TORCH_CHECK(
9-
driver_version >= 12080, "cuda driver too old to use green context!");
10-
CUcontext pctx = nullptr;
11-
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
12-
if (C10_UNLIKELY(!pctx)) {
13-
TORCH_WARN(
14-
"Attempted to create a green context but"
15-
" there was no primary context! Creating a primary context...");
16-
17-
cudaFree(0);
18-
}
19-
20-
CUdevice device;
21-
device_id_ = device_id;
22-
C10_CUDA_DRIVER_CHECK(
23-
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
24-
25-
// Get device resources
26-
CUdevResource device_resource;
27-
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
28-
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
29-
30-
// Split resources
31-
std::vector<CUdevResource> result(1);
32-
auto result_data = result.data();
33-
unsigned int nb_groups = 1;
34-
CUdevResource remaining;
3+
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
4+
#include <c10/cuda/driver_api.h>
5+
#include <stdexcept>
6+
#include <vector>
7+
#define HAS_CUDA_GREEN_CONTEXT() 1
8+
#else
9+
#define HAS_CUDA_GREEN_CONTEXT() 0
10+
// Suppress unused private field warnings as this class is not supposed to be called
11+
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
12+
#endif
3513

36-
C10_CUDA_DRIVER_CHECK(
37-
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
38-
result_data,
39-
&nb_groups,
40-
&device_resource,
41-
&remaining,
42-
0, // default flags
43-
num_sms));
44-
45-
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
46-
47-
// Generate resource descriptor
48-
CUdevResourceDesc desc;
49-
C10_CUDA_DRIVER_CHECK(
50-
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
51-
&desc, result_data, 1));
14+
namespace at::cuda {
5215

53-
// Create green context
54-
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
55-
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
56-
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
57-
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
16+
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
17+
#if HAS_CUDA_GREEN_CONTEXT()
18+
int driver_version;
19+
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
20+
TORCH_CHECK(
21+
driver_version >= 12080, "cuda driver too old to use green context!");
22+
CUcontext pctx = nullptr;
23+
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
24+
if (C10_UNLIKELY(!pctx)) {
25+
TORCH_WARN(
26+
"Attempted to create a green context but"
27+
" there was no primary context! Creating a primary context...");
28+
29+
cudaFree(0);
30+
}
5831

59-
// Convert to regular context
60-
C10_CUDA_DRIVER_CHECK(
61-
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
62-
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
32+
CUdevice device;
33+
device_id_ = device_id;
34+
C10_CUDA_DRIVER_CHECK(
35+
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
36+
37+
// Get device resources
38+
CUdevResource device_resource;
39+
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
40+
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
41+
42+
// Split resources
43+
std::vector<CUdevResource> result(1);
44+
auto result_data = result.data();
45+
unsigned int nb_groups = 1;
46+
CUdevResource remaining;
47+
48+
C10_CUDA_DRIVER_CHECK(
49+
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
50+
result_data,
51+
&nb_groups,
52+
&device_resource,
53+
&remaining,
54+
0, // default flags
55+
num_sms));
56+
57+
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
58+
59+
// Generate resource descriptor
60+
CUdevResourceDesc desc;
61+
C10_CUDA_DRIVER_CHECK(
62+
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
63+
&desc, result_data, 1));
64+
65+
// Create green context
66+
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
67+
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
68+
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
69+
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
70+
71+
// Convert to regular context
72+
C10_CUDA_DRIVER_CHECK(
73+
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
74+
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
6375
#else
64-
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
76+
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
6577
#endif
6678
}
6779

6880
std::unique_ptr<GreenContext> GreenContext::create(
6981
uint32_t num_sms,
7082
std::optional<uint32_t> device_id) {
71-
#if CUDA_HAS_GREEN_CONTEXT
83+
#if HAS_CUDA_GREEN_CONTEXT()
7284
if (!device_id.has_value()) {
7385
device_id = at::cuda::current_device();
7486
}
75-
return std::make_unique<GreenContext>(device_id.value(), num_sms);
87+
return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
7688
#else
7789
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
7890
#endif
7991
}
8092

8193
// Implement move operations
8294
GreenContext::GreenContext(GreenContext&& other) noexcept{
83-
#if CUDA_HAS_GREEN_CONTEXT
95+
#if HAS_CUDA_GREEN_CONTEXT()
8496
device_id_ = std::exchange(other.device_id_, -1);
8597
green_ctx_ = std::exchange(other.green_ctx_, nullptr);
8698
context_ = std::exchange(other.context_, nullptr);
@@ -91,7 +103,7 @@ namespace at::cuda {
91103
}
92104

93105
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
94-
#if CUDA_HAS_GREEN_CONTEXT
106+
#if HAS_CUDA_GREEN_CONTEXT()
95107
if (this != &other) {
96108
// Clean up current resources
97109
if (green_ctx_) {
@@ -120,33 +132,17 @@ namespace at::cuda {
120132
}
121133

122134
GreenContext::~GreenContext() noexcept{
123-
#if CUDA_HAS_GREEN_CONTEXT
135+
#if HAS_CUDA_GREEN_CONTEXT()
124136
C10_CUDA_DRIVER_CHECK(
125137
c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
126138
#else
127139
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
128140
#endif
129141
}
130142

131-
// Get the underlying CUDA context
132-
CUcontext GreenContext::getContext() const {
133-
#if CUDA_HAS_GREEN_CONTEXT
134-
return context_;
135-
#else
136-
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
137-
#endif
138-
}
139-
140-
// Get the underlying green context
141-
#if CUDA_HAS_GREEN_CONTEXT
142-
CUgreenCtx GreenContext::getGreenContext() const {
143-
return green_ctx_;
144-
}
145-
#endif
146-
147143
// Make this context current
148144
void GreenContext::setContext() {
149-
#if CUDA_HAS_GREEN_CONTEXT
145+
#if HAS_CUDA_GREEN_CONTEXT()
150146
auto current_stream = c10::cuda::getCurrentCUDAStream();
151147
parent_stream_ = current_stream.stream();
152148

@@ -175,7 +171,7 @@ namespace at::cuda {
175171
}
176172

177173
void GreenContext::popContext() {
178-
#if CUDA_HAS_GREEN_CONTEXT
174+
#if HAS_CUDA_GREEN_CONTEXT()
179175
// see above note about stream being hardcoded to the default stream
180176
at::cuda::CUDAEvent ev;
181177
ev.record(c10::cuda::getCurrentCUDAStream());
Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,38 @@
11
#pragma once
22
#include <ATen/cuda/CUDAEvent.h>
3-
4-
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
5-
#include <c10/cuda/driver_api.h>
63
#include <cuda.h>
7-
#include <memory>
8-
#include <stdexcept>
9-
#include <vector>
10-
#define CUDA_HAS_GREEN_CONTEXT 1
11-
#else
12-
#define CUDA_HAS_GREEN_CONTEXT 0
13-
#endif
4+
5+
// Forward declare green context as opaque ptr
6+
typedef struct CUgreenCtx_st* CUgreenCtx;
147

158
namespace at::cuda {
169

1710
class TORCH_CUDA_CPP_API GreenContext {
1811
public:
19-
GreenContext(uint32_t device_id, uint32_t num_sms);
20-
21-
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
12+
// Green context creation
13+
static std::unique_ptr<GreenContext> create(
14+
uint32_t num_sms,
15+
std::optional<uint32_t> device_id);
16+
~GreenContext() noexcept;
2217

2318
// Delete copy constructor and assignment
2419
GreenContext(const GreenContext&) = delete;
2520
GreenContext& operator=(const GreenContext&) = delete;
2621

27-
// Implement move operations
28-
GreenContext(GreenContext&& other) noexcept;
29-
GreenContext& operator=(GreenContext&& other) noexcept;
30-
~GreenContext() noexcept;
31-
32-
// Get the underlying CUDA context
33-
CUcontext getContext() const;
34-
35-
// Get the underlying green context
36-
#if CUDA_HAS_GREEN_CONTEXT
37-
CUgreenCtx getGreenContext() const;
38-
#endif
39-
4022
// Make this context current
4123
void setContext();
4224

4325
void popContext();
4426

4527
private:
46-
#if CUDA_HAS_GREEN_CONTEXT
28+
GreenContext(uint32_t device_id, uint32_t num_sms);
29+
// Implement move operations
30+
GreenContext(GreenContext&& other) noexcept;
31+
GreenContext& operator=(GreenContext&& other) noexcept;
32+
4733
int32_t device_id_ = -1;
4834
CUgreenCtx green_ctx_ = nullptr;
4935
CUcontext context_ = nullptr;
5036
cudaStream_t parent_stream_ = nullptr;
51-
#endif
5237
};
5338
} // namespace at::cuda

aten/src/ATen/cuda/CUDASparse.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,6 @@
77
#endif
88

99

10-
#if defined(USE_ROCM)
11-
// hipSparse const API added in v2.4.0
12-
#if HIPSPARSE_VERSION >= 200400
13-
#define AT_USE_HIPSPARSE_GENERIC_API() 1
14-
#else
15-
#define AT_USE_HIPSPARSE_GENERIC_API() 1
16-
#endif
17-
#else // USE_ROCM
18-
#define AT_USE_HIPSPARSE_GENERIC_API() 0
19-
#endif // USE_ROCM
20-
2110
// cuSparse Generic API spsv function was added in CUDA 11.3.0
2211
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
2312
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1

0 commit comments

Comments
 (0)