Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions compiler/rustc_codegen_llvm/src/back/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -773,11 +773,36 @@ pub(crate) unsafe fn llvm_optimize(
};

if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
let lib_bc_c = CString::new("/p/lustre1/drehwald1/prog/offload/r/lib.bc").unwrap();
let host_out_c = CString::new("/p/lustre1/drehwald1/prog/offload/r/host.out").unwrap();
let out_obj_c = CString::new("/p/lustre1/drehwald1/prog/offload/r/host.o").unwrap();

unsafe {
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
llvm::LLVMRustBundleImages(
module.module_llvm.llmod(),
module.module_llvm.tm.raw(),
host_out_c.as_ptr(),
);
}
}
unsafe {
// 1) Bundle device module into offload image host.out (device TM)
let ok = llvm::LLVMRustBundleImages(
module.module_llvm.llmod(),
module.module_llvm.tm.raw(),
host_out_c.as_ptr(),
);
assert!(ok, "LLVMRustBundleImages (device -> host.out) failed");

// 2) Finalize host: lib.bc + host.out -> host.offload.o (host TM created in C++)
let ok = llvm::LLVMRustFinalizeOffload(
lib_bc_c.as_ptr(),
host_out_c.as_ptr(),
out_obj_c.as_ptr(),
);
assert!(ok, "LLVMRustFinalizeOffload (host finalize) failed");
}
dbg!("done");
}
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
}

Expand Down
17 changes: 15 additions & 2 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1726,7 +1726,16 @@ mod Offload {
use super::*;
unsafe extern "C" {
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
pub(crate) fn LLVMRustBundleImages<'a>(
M: &'a Module,
TM: &'a TargetMachine,
host_out: *const c_char,
) -> bool;
pub(crate) fn LLVMRustFinalizeOffload(
lib_bc_path: *const c_char,
host_out_path: *const c_char,
out_obj_path: *const c_char,
) -> bool;
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
}
}
Expand All @@ -1740,7 +1749,11 @@ mod Offload_fallback {
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
pub(crate) unsafe fn LLVMRustBundleImages<'a>(
_M: &'a Module,
_TM: &'a TargetMachine,
_host_out: *const c_char,
) -> bool {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
#[allow(unused_unsafe)]
Expand Down
165 changes: 160 additions & 5 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@
#ifdef OFFLOAD
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/xxhash.h"
#endif

// for raw `write` in the bad-alloc handler
Expand Down Expand Up @@ -174,12 +186,12 @@ static Error writeFile(StringRef Filename, StringRef Data) {
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
// The input module is the rust code compiled for a gpu target like amdgpu.
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM, const char *HostOutPath) {
std::string Storage;
llvm::raw_string_ostream OS1(Storage);
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
OS1.flush();
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "device.bc");

SmallVector<char, 1024> BinaryData;
raw_svector_ostream OS2(BinaryData);
Expand All @@ -188,19 +200,103 @@ extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
ImageBinary.TheImageKind = object::IMG_Bitcode;
ImageBinary.Image = std::move(MB);
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
ImageBinary.StringData["arch"] = TM.getTargetCPU();


std::string TripleStr = TM.getTargetTriple().str();
llvm::StringRef CPURef = TM.getTargetCPU();
ImageBinary.StringData["triple"] = TripleStr;
ImageBinary.StringData["arch"] = CPURef;
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
// Offload binary has invalid size alignment
return false;
OS2 << Buffer;
if (Error E = writeFile("host.out",
if (Error E = writeFile(HostOutPath,
StringRef(BinaryData.begin(), BinaryData.size())))
return false;
return true;
}

#include "llvm/Bitcode/BitcodeReader.h"
// Reads the bitcode file at `LibBCPath` from disk and parses it into a module
// that lives in `Ctx`.
//
// Returns the file-system error (converted to an `Error`) when the file cannot
// be read; otherwise returns whatever `parseBitcodeFile` produces, i.e. either
// the parsed module or the parser's error.
Expected<std::unique_ptr<Module>>
loadHostModuleFromBitcode(LLVMContext &Ctx, StringRef LibBCPath) {
  auto FileBuf = MemoryBuffer::getFile(LibBCPath);
  if (std::error_code ReadEC = FileBuf.getError())
    return errorCodeToError(ReadEC);

  // The file buffer is released when this function returns.
  return parseBitcodeFile(FileBuf.get()->getMemBufferRef(), Ctx);
}

// Embeds the bytes of `Buf` into `M` as an offloading image.
//
// The buffer is stored as a private constant global named
// `llvm.embedded.object`, placed in the `.llvm.offloading` section with 8-byte
// alignment, recorded in the `llvm.embedded.objects` named metadata, tagged
// with `!exclude` metadata, and appended to `llvm.compiler.used`.
//
// This used to be a verbatim copy of LLVM's own `llvm::embedBufferInModule`
// (declared in llvm/Transforms/Utils/ModuleUtils.h, which this file already
// includes). Forwarding to the upstream helper keeps the two from drifting
// apart; only the section name and alignment are specific to this wrapper.
extern "C" void embedBufferInModule(Module &M, MemoryBufferRef Buf) {
  llvm::embedBufferInModule(M, Buf, /*SectionName=*/".llvm.offloading",
                            /*Alignment=*/Align(8));
}

// Reads the offload image stored at `HostOutPath` and embeds its contents into
// `HostM` via `embedBufferInModule`.
//
// Returns the file-system error (converted to an `Error`) when the file cannot
// be read, and `Error::success()` otherwise.
//
// The unconditional "step N" traces that were written to stderr on every call
// have been removed; failures are reported through the returned `Error`.
Error embedHostOutIntoHostModule(Module &HostM, StringRef HostOutPath) {
  auto MBOrErr = MemoryBuffer::getFile(HostOutPath);
  if (!MBOrErr)
    return errorCodeToError(MBOrErr.getError());

  // The file buffer is released when this function returns.
  embedBufferInModule(HostM, (*MBOrErr)->getMemBufferRef());
  return Error::success();
}

#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/IR/LegacyPassManager.h"
//#include "llvm/Support/Host.h"
//#include "llvm/Support/TargetRegistry.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/CodeGen.h" // <-- new

// Runs `TM`'s code generation pipeline over `HostM` and writes the resulting
// object file to `OutObjPath`.
//
// Returns an error when the output file cannot be opened, when `TM` cannot
// emit object files, or when writing the output fails. Returns
// `Error::success()` otherwise.
//
// NOTE(review): the module's data layout and target triple are not overwritten
// here; this presumably relies on the loaded bitcode already matching `TM` --
// confirm.
Error emitHostObjectWithTM(Module &HostM,
                           TargetMachine &TM,
                           StringRef OutObjPath) {
  // Declared before the pass manager so that the stream is destroyed after the
  // passes that were handed a reference to it.
  std::error_code EC;
  raw_fd_ostream OS(OutObjPath, EC, sys::fs::OF_None);
  if (EC)
    return errorCodeToError(EC);

  legacy::PassManager PM;
  if (TM.addPassesToEmitFile(PM, OS, nullptr, llvm::CodeGenFileType::ObjectFile))
    return createStringError(inconvertibleErrorCode(),
                             "TargetMachine can't emit a file of this type");

  PM.run(HostM);

  // Surface write failures (e.g. a full disk) to the caller instead of
  // reporting success for a truncated object file. Clearing the error keeps
  // the stream's destructor from aborting on an unhandled error.
  OS.flush();
  if (OS.has_error()) {
    std::error_code WriteEC = OS.error();
    OS.clear_error();
    return errorCodeToError(WriteEC);
  }
  return Error::success();
}

extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
Expand All @@ -222,6 +318,65 @@ extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
}
#endif

// Only offload code needs this helper, and the LLVM headers it relies on are
// pulled in under the same guard, so keep it out of non-offload builds.
#ifdef OFFLOAD
// Creates a TargetMachine for the host side of an offload build.
//
// The triple ("x86_64-unknown-linux-gnu") and CPU ("x86-64") are HARDCODED, so
// this only produces a usable machine for that host; adapt it if your CI/host
// differs. The relocation model is PIC and no extra target features are set.
//
// Returns nullptr when the triple is not registered with LLVM's
// TargetRegistry.
static std::unique_ptr<TargetMachine> createHostTargetMachine() {
  // A function-local static with an initializer runs exactly once and is
  // synchronised by the language, unlike a hand-rolled `static bool` flag,
  // which races if two threads reach this function at the same time.
  static const bool TargetsInitialized = [] {
    InitializeAllTargets();
    InitializeAllTargetMCs();
    InitializeAllAsmPrinters();
    InitializeAllAsmParsers();
    return true;
  }();
  (void)TargetsInitialized;

  // Hardcoded host triple + CPU (adapt if your CI/host differs)
  std::string TripleStr = "x86_64-unknown-linux-gnu";
  std::string CPU = "x86-64"; // OK for X86

  std::string Err;
  const Target *T = TargetRegistry::lookupTarget(TripleStr, Err);
  if (!T) {
    // `Err` holds the lookup failure message; it is currently not reported.
    return nullptr;
  }

  TargetOptions Opts;
  auto RM = std::optional<Reloc::Model>(Reloc::PIC_);

  return std::unique_ptr<TargetMachine>(
      T->createTargetMachine(TripleStr, CPU, /*Features*/"", Opts, RM));
}
#endif // OFFLOAD

// The helpers used below (bitcode loading, embedding, object emission) are
// only defined in offload builds, so this entry point must be guarded the same
// way to keep non-offload builds compiling.
#ifdef OFFLOAD
// Top-level entry: host finalize step of an offload build.
//
// Loads the host bitcode at `LibBCPath`, embeds the offload image found at
// `HostOutPath` into it, and emits the combined module as an object file at
// `OutObjPath` using a freshly created host TargetMachine.
//
// All three arguments are NUL-terminated paths. Returns true on success and
// false on any failure; the reason for a failure is written to stderr so that
// the caller is not left with a bare `false`.
extern "C" bool LLVMRustFinalizeOffload(const char *LibBCPath,
                                        const char *HostOutPath,
                                        const char *OutObjPath) {
  if (!LibBCPath || !HostOutPath || !OutObjPath) {
    llvm::errs() << "LLVMRustFinalizeOffload: null path argument\n";
    return false;
  }

  // Consumes `E` (an `Error` must always be handled) while preserving its
  // message, and yields the failure return value.
  auto Fail = [](Error E) {
    logAllUnhandledErrors(std::move(E), llvm::errs(),
                          "LLVMRustFinalizeOffload: ");
    return false;
  };

  LLVMContext Ctx;

  // 1. Load host lib.bc
  auto ModOrErr = loadHostModuleFromBitcode(Ctx, LibBCPath);
  if (!ModOrErr)
    return Fail(ModOrErr.takeError());
  std::unique_ptr<Module> HostM = std::move(*ModOrErr);

  // 2. Embed host.out
  if (Error E = embedHostOutIntoHostModule(*HostM, HostOutPath))
    return Fail(std::move(E));

  // 3. Create host TM and emit host object
  auto HostTM = createHostTargetMachine();
  if (!HostTM) {
    llvm::errs()
        << "LLVMRustFinalizeOffload: failed to create host target machine\n";
    return false;
  }

  if (Error E = emitHostObjectWithTM(*HostM, *HostTM, OutObjPath))
    return Fail(std::move(E));

  return true;
}
#endif // OFFLOAD

extern "C" LLVMValueRef LLVMRustGetNamedValue(LLVMModuleRef M, const char *Name,
size_t NameLen) {
return wrap(unwrap(M)->getNamedValue(StringRef(Name, NameLen)));
Expand Down
19 changes: 8 additions & 11 deletions src/doc/rustc-dev-guide/src/offload/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,28 +68,25 @@ pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
## Compile instructions
It is important to use a clang compiler built on the same LLVM as rustc. Just calling clang without the full path will likely use your system clang, which will probably be incompatible. So either substitute the clang/lld invocations below with absolute paths, or set your `PATH` accordingly.

First we generate the host (cpu) code. The first build is just to compile libc, take note of the hashed path. Then we call rustc directly to build our host code, while providing the libc artifact to rustc.
First we generate the host (cpu) code.
```
cargo +offload build -r -v
rustc +offload --edition 2024 src/lib.rs -g --crate-type cdylib -C opt-level=3 -C panic=abort -C lto=fat -L dependency=/absolute_path_to/target/release/deps --extern libc=/absolute_path_to/target/release/deps/liblibc-<HASH>.rlib --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options
RUSTFLAGS="--emit=llvm-bc -Zoffload=Enable -Zunstable-options" cargo +offload build -r
```
You might afterwards need to copy your target/release/deps/<lib_name>.bc to lib.bc for now, before the next step.

Now we generate the device code. Replace the target-cpu with the right code for your gpu.
```
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
RUSTFLAGS="-Ctarget-cpu=gfx90a -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r --target amdgcn-amd-amdhsa -Zbuild-std=core
```

This call also does a lot of work and generates multiple intermediate files for llvm offload.
While we have integrated most offload steps into rustc by now, one binary invocation still remains:

```
"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"

"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"

"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOlUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
```

Especially for the last three commands, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
You can try to find the paths to those files on your system. However, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
It will show multiple steps; just look for the clang-linker-wrapper invocation. Make sure to still include the `host.o` file, and not whatever temporary file you got when compiling your C++ example with the following call.
```
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
```
Expand Down
Loading