From c9cf6897cdeb8c90ff087be93a5d3fe5bd5fe3ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Mon, 8 Dec 2025 20:47:53 +0100 Subject: [PATCH 1/2] Split runtime global logic and cache kernel specific one --- compiler/rustc_codegen_llvm/src/base.rs | 10 +- .../src/builder/gpu_offload.rs | 186 ++++++++++++------ compiler/rustc_codegen_llvm/src/context.rs | 9 + compiler/rustc_codegen_llvm/src/intrinsic.rs | 18 +- compiler/rustc_feature/src/builtin_attrs.rs | 2 +- tests/codegen-llvm/gpu_offload/gpu_host.rs | 18 +- 6 files changed, 159 insertions(+), 84 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs index 6cbddfec46318..868922c128cb7 100644 --- a/compiler/rustc_codegen_llvm/src/base.rs +++ b/compiler/rustc_codegen_llvm/src/base.rs @@ -23,13 +23,14 @@ use rustc_middle::dep_graph; use rustc_middle::middle::codegen_fn_attrs::{CodegenFnAttrs, SanitizerFnAttrs}; use rustc_middle::mir::mono::Visibility; use rustc_middle::ty::TyCtxt; -use rustc_session::config::DebugInfo; +use rustc_session::config::{DebugInfo, Offload}; use rustc_span::Symbol; use rustc_target::spec::SanitizerSet; use super::ModuleLlvm; use crate::attributes; use crate::builder::Builder; +use crate::builder::gpu_offload::OffloadGlobals; use crate::context::CodegenCx; use crate::llvm::{self, Value}; @@ -85,6 +86,13 @@ pub(crate) fn compile_codegen_unit( let llvm_module = ModuleLlvm::new(tcx, cgu_name.as_str()); { let mut cx = CodegenCx::new(tcx, cgu, &llvm_module); + + if cx.sess().opts.unstable_opts.offload.contains(&Offload::Enable) + && !cx.sess().target.is_like_gpu + { + cx.offload_globals.replace(Some(OffloadGlobals::declare(&cx))); + } + let mono_items = cx.codegen_unit.items_in_deterministic_order(cx.tcx); for &(mono_item, data) in &mono_items { mono_item.predefine::>( diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 5d1ddd057d88a..f28037fb9cff6 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -2,17 +2,76 @@ use std::ffi::CString; use llvm::Linkage::*; use rustc_abi::Align; -use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; +use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods}; use rustc_middle::ty::offload_meta::OffloadMetadata; -use crate::builder::SBuilder; +use crate::builder::Builder; +use crate::common::CodegenCx; use crate::llvm::AttributePlace::Function; -use crate::llvm::{self, BasicBlock, Linkage, Type, Value}; +use crate::llvm::{self, Linkage, Type, Value}; use crate::{SimpleCx, attributes}; +// LLVM kernel-independent globals required for offloading +pub(crate) struct OffloadGlobals<'ll> { + pub launcher_fn: &'ll llvm::Value, + pub launcher_ty: &'ll llvm::Type, + + pub bin_desc: &'ll llvm::Type, + + pub kernel_args_ty: &'ll llvm::Type, + + pub offload_entry_ty: &'ll llvm::Type, + + pub begin_mapper: &'ll llvm::Value, + pub end_mapper: &'ll llvm::Value, + pub mapper_fn_ty: &'ll llvm::Type, + + pub ident_t_global: &'ll llvm::Value, + + pub register_lib: &'ll llvm::Value, + pub unregister_lib: &'ll llvm::Value, + pub init_rtls: &'ll llvm::Value, +} + +impl<'ll> OffloadGlobals<'ll> { + pub(crate) fn declare(cx: &CodegenCx<'ll, '_>) -> Self { + let (launcher_fn, launcher_ty) = generate_launcher(cx); + let kernel_args_ty = KernelArgsTy::new_decl(cx); + let offload_entry_ty = TgtOffloadEntry::new_decl(cx); + let (begin_mapper, _, end_mapper, mapper_fn_ty) = gen_tgt_data_mappers(cx); + let ident_t_global = generate_at_one(cx); + + let tptr = cx.type_ptr(); + let ti32 = cx.type_i32(); + let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr]; + let bin_desc = cx.type_named_struct("struct.__tgt_bin_desc"); + cx.set_struct_body(bin_desc, &tgt_bin_desc_ty, false); + + let register_lib = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty); + let unregister_lib = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty); + let init_ty = cx.type_func(&[], cx.type_void()); + let init_rtls = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty); + + OffloadGlobals { + launcher_fn, + launcher_ty, + bin_desc, + kernel_args_ty, + offload_entry_ty, + begin_mapper, + end_mapper, + mapper_fn_ty, + ident_t_global, + register_lib, + unregister_lib, + init_rtls, + } + } +} + // ; Function Attrs: nounwind // declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2 -fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> (&'ll llvm::Value, &'ll llvm::Type) { +fn generate_launcher<'ll>(cx: &CodegenCx<'ll, '_>) -> (&'ll llvm::Value, &'ll llvm::Type) { let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); let ti32 = cx.type_i32(); @@ -30,7 +89,7 @@ fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> (&'ll llvm::Value, &'ll llvm // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 // FIXME(offload): @0 should include the file name (e.g. lib.rs) in which the function to be // offloaded was defined. -fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value { +pub(crate) fn generate_at_one<'ll>(cx: &CodegenCx<'ll, '_>) -> &'ll llvm::Value { let unknown_txt = ";unknown;unknown;0;0;;"; let c_entry_name = CString::new(unknown_txt).unwrap(); let c_val = c_entry_name.as_bytes_with_nul(); @@ -68,7 +127,7 @@ pub(crate) struct TgtOffloadEntry { } impl TgtOffloadEntry { - pub(crate) fn new_decl<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type { + pub(crate) fn new_decl<'ll>(cx: &CodegenCx<'ll, '_>) -> &'ll llvm::Type { let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry"); let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); @@ -82,7 +141,7 @@ impl TgtOffloadEntry { } fn new<'ll>( - cx: &'ll SimpleCx<'_>, + cx: &CodegenCx<'ll, '_>, region_id: &'ll Value, llglobal: &'ll Value, ) -> [&'ll Value; 9] { @@ -126,7 +185,7 @@ impl KernelArgsTy { const OFFLOAD_VERSION: u64 = 3; const FLAGS: u64 = 0; const TRIPCOUNT: u64 = 0; - fn new_decl<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll Type { + fn new_decl<'ll>(cx: &CodegenCx<'ll, '_>) -> &'ll Type { let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments"); let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); @@ -140,8 +199,8 @@ impl KernelArgsTy { kernel_arguments_ty } - fn new<'ll>( - cx: &'ll SimpleCx<'_>, + fn new<'ll, 'tcx>( + cx: &CodegenCx<'ll, 'tcx>, num_args: u64, memtransfer_types: &'ll Value, geps: [&'ll Value; 3], @@ -171,7 +230,8 @@ impl KernelArgsTy { } // Contains LLVM values needed to manage offloading for a single kernel. -pub(crate) struct OffloadKernelData<'ll> { +#[derive(Copy, Clone)] +pub(crate) struct OffloadKernelGlobals<'ll> { pub offload_sizes: &'ll llvm::Value, pub memtransfer_types: &'ll llvm::Value, pub region_id: &'ll llvm::Value, @@ -179,7 +239,7 @@ pub(crate) struct OffloadKernelData<'ll> { } fn gen_tgt_data_mappers<'ll>( - cx: &'ll SimpleCx<'_>, + cx: &CodegenCx<'ll, '_>, ) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) { let tptr = cx.type_ptr(); let ti64 = cx.type_i64(); @@ -241,12 +301,18 @@ pub(crate) fn add_global<'ll>( // mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be // concatenated into the list of region_ids. pub(crate) fn gen_define_handling<'ll>( - cx: &SimpleCx<'ll>, - offload_entry_ty: &'ll llvm::Type, + cx: &CodegenCx<'ll, '_>, metadata: &[OffloadMetadata], - types: &[&Type], - symbol: &str, -) -> OffloadKernelData<'ll> { + types: &[&'ll Type], + symbol: String, + offload_globals: &OffloadGlobals<'ll>, +) -> OffloadKernelGlobals<'ll> { + if let Some(entry) = cx.offload_kernel_cache.borrow().get(&symbol) { + return *entry; + } + + let offload_entry_ty = offload_globals.offload_entry_ty; + // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or // reference) types. let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) { @@ -274,7 +340,7 @@ pub(crate) fn gen_define_handling<'ll>( let initializer = cx.get_const_i8(0); let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage); - let c_entry_name = CString::new(symbol).unwrap(); + let c_entry_name = CString::new(symbol.clone()).unwrap(); let c_val = c_entry_name.as_bytes_with_nul(); let offload_entry_name = format!(".offloading.entry_name.{symbol}"); @@ -298,11 +364,16 @@ pub(crate) fn gen_define_handling<'ll>( let c_section_name = CString::new("llvm_offload_entries").unwrap(); llvm::set_section(offload_entry, &c_section_name); - OffloadKernelData { offload_sizes, memtransfer_types, region_id, offload_entry } + let result = + OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id, offload_entry }; + + cx.offload_kernel_cache.borrow_mut().insert(symbol, result); + + result } fn declare_offload_fn<'ll>( - cx: &'ll SimpleCx<'_>, + cx: &CodegenCx<'ll, '_>, name: &str, ty: &'ll llvm::Type, ) -> &'ll llvm::Value { @@ -335,28 +406,28 @@ fn declare_offload_fn<'ll>( // 4. set insert point after kernel call. // 5. generate all the GEPS and stores, to be used in 6) // 6. generate __tgt_target_data_end calls to move data from the GPU -pub(crate) fn gen_call_handling<'ll>( - cx: &SimpleCx<'ll>, - bb: &BasicBlock, - offload_data: &OffloadKernelData<'ll>, +pub(crate) fn gen_call_handling<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, + offload_data: &OffloadKernelGlobals<'ll>, args: &[&'ll Value], types: &[&Type], metadata: &[OffloadMetadata], + offload_globals: &OffloadGlobals<'ll>, ) { - let OffloadKernelData { offload_sizes, offload_entry, memtransfer_types, region_id } = + let cx = builder.cx; + let OffloadKernelGlobals { offload_sizes, offload_entry, memtransfer_types, region_id } = offload_data; - let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx); - // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } - let tptr = cx.type_ptr(); - let ti32 = cx.type_i32(); - let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr]; - let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc"); - cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false); - let tgt_kernel_decl = KernelArgsTy::new_decl(&cx); - let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx); + let tgt_decl = offload_globals.launcher_fn; + let tgt_target_kernel_ty = offload_globals.launcher_ty; - let mut builder = SBuilder::build(cx, bb); + // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } + let tgt_bin_desc = offload_globals.bin_desc; + + let tgt_kernel_decl = offload_globals.kernel_args_ty; + let begin_mapper_decl = offload_globals.begin_mapper; + let end_mapper_decl = offload_globals.end_mapper; + let fn_ty = offload_globals.mapper_fn_ty; let num_args = types.len() as u64; let ip = unsafe { llvm::LLVMRustGetInsertPoint(&builder.llbuilder) }; @@ -378,9 +449,8 @@ pub(crate) fn gen_call_handling<'ll>( // Step 0) // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // %6 = alloca %struct.__tgt_bin_desc, align 8 - let llfn = unsafe { llvm::LLVMGetBasicBlockParent(bb) }; unsafe { - llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, llfn); + llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn()); } let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc"); @@ -413,16 +483,16 @@ pub(crate) fn gen_call_handling<'ll>( } let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void()); - let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty); - let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty); + let register_lib_decl = offload_globals.register_lib; + let unregister_lib_decl = offload_globals.unregister_lib; let init_ty = cx.type_func(&[], cx.type_void()); - let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty); + let init_rtls_decl = offload_globals.init_rtls; // FIXME(offload): Later we want to add them to the wrapper code, rather than our main function. // call void @__tgt_register_lib(ptr noundef %6) - builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None); + builder.call(mapper_fn_ty, None, None, register_lib_decl, &[tgt_bin_desc_alloca], None, None); // call void @__tgt_init_all_rtls() - builder.call(init_ty, init_rtls_decl, &[], None); + builder.call(init_ty, None, None, init_rtls_decl, &[], None, None); for i in 0..num_args { let idx = cx.get_const_i32(i); @@ -437,15 +507,15 @@ pub(crate) fn gen_call_handling<'ll>( // For now we have a very simplistic indexing scheme into our // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr. - fn get_geps<'a, 'll>( - builder: &mut SBuilder<'a, 'll>, - cx: &'ll SimpleCx<'ll>, + fn get_geps<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, ty: &'ll Type, ty2: &'ll Type, a1: &'ll Value, a2: &'ll Value, a4: &'ll Value, ) -> [&'ll Value; 3] { + let cx = builder.cx; let i32_0 = cx.get_const_i32(0); let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]); @@ -454,9 +524,8 @@ pub(crate) fn gen_call_handling<'ll>( [gep1, gep2, gep3] } - fn generate_mapper_call<'a, 'll>( - builder: &mut SBuilder<'a, 'll>, - cx: &'ll SimpleCx<'ll>, + fn generate_mapper_call<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, geps: [&'ll Value; 3], o_type: &'ll Value, fn_to_call: &'ll Value, @@ -464,20 +533,20 @@ pub(crate) fn gen_call_handling<'ll>( num_args: u64, s_ident_t: &'ll Value, ) { + let cx = builder.cx; let nullptr = cx.const_null(cx.type_ptr()); let i64_max = cx.get_const_i64(u64::MAX); let num_args = cx.get_const_i32(num_args); let args = vec![s_ident_t, i64_max, num_args, geps[0], geps[1], geps[2], o_type, nullptr, nullptr]; - builder.call(fn_ty, fn_to_call, &args, None); + builder.call(fn_ty, None, None, fn_to_call, &args, None, None); } // Step 2) - let s_ident_t = generate_at_one(&cx); - let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4); + let s_ident_t = offload_globals.ident_t_global; + let geps = get_geps(builder, ty, ty2, a1, a2, a4); generate_mapper_call( - &mut builder, - &cx, + builder, geps, memtransfer_types, begin_mapper_decl, @@ -504,14 +573,13 @@ pub(crate) fn gen_call_handling<'ll>( region_id, a5, ]; - builder.call(tgt_target_kernel_ty, tgt_decl, &args, None); + builder.call(tgt_target_kernel_ty, None, None, tgt_decl, &args, None, None); // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args) // Step 4) - let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4); + let geps = get_geps(builder, ty, ty2, a1, a2, a4); generate_mapper_call( - &mut builder, - &cx, + builder, geps, memtransfer_types, end_mapper_decl, @@ -520,7 +588,5 @@ pub(crate) fn gen_call_handling<'ll>( s_ident_t, ); - builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None); - - drop(builder); + builder.call(mapper_fn_ty, None, None, unregister_lib_decl, &[tgt_bin_desc_alloca], None, None); } diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs index 6caf60e3cc41e..03da70d48f7d6 100644 --- a/compiler/rustc_codegen_llvm/src/context.rs +++ b/compiler/rustc_codegen_llvm/src/context.rs @@ -35,6 +35,7 @@ use smallvec::SmallVec; use crate::abi::to_llvm_calling_convention; use crate::back::write::to_llvm_code_model; +use crate::builder::gpu_offload::{OffloadGlobals, OffloadKernelGlobals}; use crate::callee::get_fn; use crate::debuginfo::metadata::apply_vcall_visibility_metadata; use crate::llvm::{self, Metadata, MetadataKindId, Module, Type, Value}; @@ -156,6 +157,12 @@ pub(crate) struct FullCx<'ll, 'tcx> { /// Cache of Objective-C selector references pub objc_selrefs: RefCell>, + + /// Globals shared by the offloading runtime + pub offload_globals: RefCell>>, + + /// Cache of kernel-specific globals + pub offload_kernel_cache: RefCell>>, } fn to_llvm_tls_model(tls_model: TlsModel) -> llvm::ThreadLocalMode { @@ -639,6 +646,8 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { objc_class_t: Cell::new(None), objc_classrefs: Default::default(), objc_selrefs: Default::default(), + offload_globals: Default::default(), + offload_kernel_cache: Default::default(), }, PhantomData, ) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 33541f7b695f8..3f5a080e7776c 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -26,7 +26,7 @@ use tracing::debug; use crate::abi::FnAbiLlvmExt; use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; -use crate::builder::gpu_offload::TgtOffloadEntry; +use crate::builder::gpu_offload::{gen_call_handling, gen_define_handling}; use crate::context::CodegenCx; use crate::errors::{ AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO, @@ -1287,8 +1287,6 @@ fn codegen_offload<'ll, 'tcx>( let args = get_args_from_tuple(bx, args[1], fn_target); let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE); - let offload_entry_ty = TgtOffloadEntry::new_decl(&cx); - let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder(); let inputs = sig.inputs(); @@ -1296,17 +1294,11 @@ fn codegen_offload<'ll, 'tcx>( let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); - let offload_data = crate::builder::gpu_offload::gen_define_handling( - cx, - offload_entry_ty, - &metadata, - &types, - &target_symbol, - ); + let offload_globals_ref = cx.offload_globals.borrow(); + let offload_globals = offload_globals_ref.as_ref().unwrap(); - // FIXME(Sa4dUs): pass the original builder once we separate kernel launch logic from globals - let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) }; - crate::builder::gpu_offload::gen_call_handling(cx, bb, &offload_data, &args, &types, &metadata); + let offload_data = gen_define_handling(&cx, &metadata, &types, target_symbol, offload_globals); + gen_call_handling(bx, &offload_data, &args, &types, &metadata, offload_globals); } fn get_args_from_tuple<'ll, 'tcx>( diff --git a/compiler/rustc_feature/src/builtin_attrs.rs b/compiler/rustc_feature/src/builtin_attrs.rs index 6eefb2f48d127..c8245b0e43bd5 100644 --- a/compiler/rustc_feature/src/builtin_attrs.rs +++ b/compiler/rustc_feature/src/builtin_attrs.rs @@ -1118,7 +1118,7 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[ template!(Word, List: &[r#""...""#]), DuplicatesOk, EncodeCrossCrate::Yes, ), - rustc_attr!( + rustc_attr!( rustc_offload_kernel, Normal, template!(Word), DuplicatesOk, EncodeCrossCrate::Yes, diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs index b0f83c825705f..06bb2d92d8ba2 100644 --- a/tests/codegen-llvm/gpu_offload/gpu_host.rs +++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs @@ -11,6 +11,7 @@ // when inside of a function called main. This, too, is a temporary workaround for not having a // frontend. +#![feature(rustc_attrs)] #![feature(core_intrinsics)] #![no_main] @@ -21,29 +22,31 @@ fn main() { core::hint::black_box(&x); } -// CHECK: %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr } // CHECK: %struct.ident_t = type { i32, i32, i32, i32, ptr } +// CHECK: %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr } // CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr } // CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 } +// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 + // CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024] // CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35] // CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0 // CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1 // CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 -// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 -// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8 +// CHECK: Function Attrs: nounwind +// CHECK: declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) -// CHECK: Function Attrs: -// CHECK-NEXT: define{{( dso_local)?}} void @main() +// CHECK: define{{( dso_local)?}} void @main() // CHECK-NEXT: start: // CHECK-NEXT: %0 = alloca [8 x i8], align 8 // CHECK-NEXT: %x = alloca [1024 x i8], align 16 // CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0) // CHECK-NEXT: store ptr %x, ptr %0, align 8 -// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4 +// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0) // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x) // CHECK-NEXT: ret void @@ -92,9 +95,6 @@ fn main() { // CHECK-NEXT: ret void // CHECK-NEXT: } -// CHECK: Function Attrs: nounwind -// CHECK: declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) - #[unsafe(no_mangle)] #[inline(never)] pub fn kernel_1(x: &mut [f32; 256]) { From b357e0cf50ba7d7d1cc9d395ae14f4ee0d5df1bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcelo=20Dom=C3=ADnguez?= Date: Mon, 8 Dec 2025 22:47:05 +0100 Subject: [PATCH 2/2] Remove outdated comment --- compiler/rustc_codegen_llvm/src/back/write.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index c0c01b80372f3..c6a1440c86dac 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -710,8 +710,7 @@ pub(crate) unsafe fn llvm_optimize( if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) { let cx = SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size); - // For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is - // introducing a proper offload intrinsic to solve this limitation. + for func in cx.get_functions() { let offload_kernel = "offload-kernel"; if attributes::has_string_attr(func, offload_kernel) {