Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/back/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,13 @@ pub(crate) unsafe fn llvm_optimize(
llvm_plugins.len(),
)
};

if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
unsafe {
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
}
}

result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
}

Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1641,6 +1641,9 @@ unsafe extern "C" {
Name: *const c_char,
) -> &'a Value;

/// Serializes the module into an offload-compatible image and writes it to a "host.out" file.
/// Returns `true` on success and `false` if the image could not be created or written.
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;

/// Writes a module to the specified path. Returns 0 on success.
pub(crate) fn LLVMWriteBitcodeToFile(M: &Module, Path: *const c_char) -> c_int;

Expand Down
47 changes: 47 additions & 0 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Remarks/RemarkFormat.h"
#include "llvm/Remarks/RemarkSerializer.h"
#include "llvm/Remarks/RemarkStreamer.h"
Expand All @@ -35,6 +36,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
Expand Down Expand Up @@ -144,6 +146,51 @@ extern "C" void LLVMRustPrintStatistics(RustStringRef OutBuf) {
llvm::PrintStatistics(OS);
}

// Writes the bytes of `Data` to the file named `Filename`.
//
// The file is created via FileOutputBuffer with exactly `Data.size()` bytes,
// filled with a copy of `Data`, and then committed. Any error from creating
// or committing the buffer is returned to the caller; on success the result
// is Error::success().
static Error writeFile(StringRef Filename, StringRef Data) {
  auto BufferOrErr = FileOutputBuffer::create(Filename, Data.size());
  // takeError() yields a success value when creation worked, so this only
  // returns early on a real failure.
  if (Error CreateErr = BufferOrErr.takeError())
    return CreateErr;

  std::unique_ptr<FileOutputBuffer> OutBuf = std::move(*BufferOrErr);
  llvm::copy(Data, OutBuf->getBufferStart());

  // commit() already reports success or failure as an Error.
  return OutBuf->commit();
}

// This is the first of many steps in creating a binary using llvm offload,
// to run code on the gpu. Concretely, it replaces the following binary use:
// clang-offload-packager -o host.out
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
// The input module is the rust code compiled for a gpu target like amdgpu.
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
//
// Serializes `M` to bitcode, wraps it in an OpenMP offloading image tagged
// with the target triple and target CPU of `TM`, and writes the packaged
// image to "host.out" (a relative path).
//
// Returns true on success. Returns false if the packaged image size is not a
// multiple of the OffloadBinary alignment, or if "host.out" could not be
// written.
//
// The file is written here rather than handing the bytes back to rustc in a
// caller-provided buffer: the packaged size is not known up front, and a
// buffer allocated on the Rust side could not be resized from C++.
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
  // Serialize the module to in-memory bitcode.
  std::string Storage;
  llvm::raw_string_ostream OS1(Storage);
  llvm::WriteBitcodeToFile(*unwrap(M), OS1);
  OS1.flush();
  auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");

  // Describe the image the same way the clang-offload-packager invocation
  // above would (bitcode image, kind=openmp, triple and arch from `TM`).
  OffloadBinary::OffloadingImage ImageBinary{};
  ImageBinary.TheImageKind = object::IMG_Bitcode;
  ImageBinary.Image = std::move(MB);
  ImageBinary.TheOffloadKind = object::OFK_OpenMP;
  ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
  ImageBinary.StringData["arch"] = TM.getTargetCPU();

  llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
  if (Buffer.size() % OffloadBinary::getAlignment() != 0)
    // Offload binary has invalid size alignment
    return false;

  // Write the packaged bytes directly; no intermediate copy is needed.
  if (Error E = writeFile("host.out", Buffer.str())) {
    // An llvm::Error in the failure state must be handled before it is
    // destroyed (LLVM aborts on unhandled errors in checked builds), so
    // report it instead of silently dropping it.
    llvm::logAllUnhandledErrors(std::move(E), llvm::errs(),
                                "LLVMRustBundleImages: ");
    return false;
  }
  return true;
}

extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
Expand Down
14 changes: 2 additions & 12 deletions src/doc/rustc-dev-guide/src/offload/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,27 +79,17 @@ Now we generate the device code. Replace the target-cpu with the right code for
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
```

Now find the `<libname>.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
If you work on an NVIDIA or Intel GPU, please adjust the names accordingly and open an issue to share your results (whether you succeed or fail).
First we compile our .ll files (good for manual inspections) to .bc files and clean up leftover artifacts. The cleanup is important, otherwise caching might interfere on following runs.
```
opt lib.ll -o lib.bc
opt device.ll -o device.bc
rm *.o
rm bare.amdgcn.gfx90a.img*
```

```
"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"

"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"

"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"

"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
```

Especially for the last command I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
Especially for the last three commands, I recommend not fixing the paths, but rather just re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
```
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
```
Expand Down
Loading