diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs index 1f59d250e08a0..af5c61cb79dd8 100644 --- a/compiler/rustc_codegen_llvm/src/attributes.rs +++ b/compiler/rustc_codegen_llvm/src/attributes.rs @@ -422,7 +422,7 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>( to_add.push(uwtable_attr(cx.llcx, sess.opts.unstable_opts.use_sync_unwind)); } - if sess.opts.unstable_opts.profile_sample_use.is_some() { + if sess.opts.cg.profile_sample_use.is_some() { to_add.push(llvm::CreateAttrString(cx.llcx, "use-sample-profile")); } diff --git a/compiler/rustc_codegen_ssa/src/back/linker.rs b/compiler/rustc_codegen_ssa/src/back/linker.rs index eb908e19be54e..44e0376d225e5 100644 --- a/compiler/rustc_codegen_ssa/src/back/linker.rs +++ b/compiler/rustc_codegen_ssa/src/back/linker.rs @@ -424,7 +424,7 @@ impl<'a> GccLinker<'a> { config::OptLevel::Aggressive => "O3", }; - if let Some(path) = &self.sess.opts.unstable_opts.profile_sample_use { + if let Some(path) = &self.sess.opts.cg.profile_sample_use { self.link_arg(&format!("-plugin-opt=sample-profile={}", path.display())); }; let prefix = if self.codegen_backend == "gcc" { diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs index ff91a08de4de6..d1dd9dfb5814c 100644 --- a/compiler/rustc_codegen_ssa/src/back/write.rs +++ b/compiler/rustc_codegen_ssa/src/back/write.rs @@ -169,8 +169,8 @@ impl ModuleConfig { SwitchWithOptPath::Disabled ), pgo_use: if_regular!(sess.opts.cg.profile_use.clone(), None), - pgo_sample_use: if_regular!(sess.opts.unstable_opts.profile_sample_use.clone(), None), - debug_info_for_profiling: sess.opts.unstable_opts.debug_info_for_profiling, + pgo_sample_use: if_regular!(sess.opts.cg.profile_sample_use.clone(), None), + debug_info_for_profiling: sess.opts.cg.debuginfo_for_profiling, instrument_coverage: if_regular!(sess.instrument_coverage(), false), sanitizer: if_regular!(sess.sanitizers(), 
SanitizerSet::empty()), diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs index eba38cf24b346..e2b979c6498aa 100644 --- a/compiler/rustc_interface/src/passes.rs +++ b/compiler/rustc_interface/src/passes.rs @@ -651,7 +651,7 @@ fn write_out_deps(tcx: TyCtxt<'_>, outputs: &OutputFilenames, out_filenames: &[P checksum_hash_algo, )); } - if let Some(ref profile_sample) = sess.opts.unstable_opts.profile_sample_use { + if let Some(ref profile_sample) = sess.opts.cg.profile_sample_use { files.extend(hash_iter_files( iter::once(normalize_path(profile_sample.as_path().to_path_buf())), checksum_hash_algo, diff --git a/compiler/rustc_interface/src/tests.rs b/compiler/rustc_interface/src/tests.rs index e54f68b6391e9..51488e18bf010 100644 --- a/compiler/rustc_interface/src/tests.rs +++ b/compiler/rustc_interface/src/tests.rs @@ -613,6 +613,7 @@ fn test_codegen_options_tracking_hash() { tracked!(control_flow_guard, CFGuard::Checks); tracked!(debug_assertions, Some(true)); tracked!(debuginfo, DebugInfo::Limited); + tracked!(debuginfo_for_profiling, true); tracked!(dwarf_version, Some(5)); tracked!(embed_bitcode, false); tracked!(force_frame_pointers, FramePointer::Always); @@ -634,6 +635,7 @@ fn test_codegen_options_tracking_hash() { tracked!(passes, vec![String::from("1"), String::from("2")]); tracked!(prefer_dynamic, true); tracked!(profile_generate, SwitchWithOptPath::Enabled(None)); + tracked!(profile_sample_use, Some(PathBuf::from("abc"))); tracked!(profile_use, Some(PathBuf::from("abc"))); tracked!(relocation_model, Some(RelocModel::Pic)); tracked!(relro_level, Some(RelroLevel::Full)); @@ -785,7 +787,6 @@ fn test_unstable_options_tracking_hash() { ); tracked!(crate_attr, vec!["abc".to_string()]); tracked!(cross_crate_inline_threshold, InliningThreshold::Always); - tracked!(debug_info_for_profiling, true); tracked!(debug_info_type_line_numbers, true); tracked!(default_visibility, Some(rustc_target::spec::SymbolVisibility::Hidden)); 
tracked!(dep_info_omit_d_target, true); @@ -849,7 +850,6 @@ fn test_unstable_options_tracking_hash() { tracked!(plt, Some(true)); tracked!(polonius, Polonius::Legacy); tracked!(precise_enum_drop_elaboration, false); - tracked!(profile_sample_use, Some(PathBuf::from("abc"))); tracked!(profiler_runtime, "abc".to_string()); tracked!(reg_struct_return, true); tracked!(regparm, Some(3)); diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs index cd9d573957f45..5b3eaa44c547d 100644 --- a/compiler/rustc_session/src/config.rs +++ b/compiler/rustc_session/src/config.rs @@ -2513,11 +2513,11 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M early_dcx.early_fatal("options `-C profile-generate` and `-C profile-use` are exclusive"); } - if unstable_opts.profile_sample_use.is_some() + if cg.profile_sample_use.is_some() && (cg.profile_generate.enabled() || cg.profile_use.is_some()) { early_dcx.early_fatal( - "option `-Z profile-sample-use` cannot be used with `-C profile-generate` or `-C profile-use`", + "option `-C profile-sample-use` cannot be used with `-C profile-generate` or `-C profile-use`", ); } diff --git a/compiler/rustc_session/src/options.rs b/compiler/rustc_session/src/options.rs index 2a2d46615e2e7..ca4b446b19038 100644 --- a/compiler/rustc_session/src/options.rs +++ b/compiler/rustc_session/src/options.rs @@ -2056,6 +2056,8 @@ options! { debuginfo: DebugInfo = (DebugInfo::None, parse_debuginfo, [TRACKED], "debug info emission level (0-2, none, line-directives-only, \ line-tables-only, limited, or full; default: 0)"), + debuginfo_for_profiling: bool = (false, parse_bool, [TRACKED], + "emit extra debug info to make sample profile more accurate"), default_linker_libraries: bool = (false, parse_bool, [UNTRACKED], "allow the linker to link its default libraries (default: no)"), dlltool: Option = (None, parse_opt_pathbuf, [UNTRACKED], @@ -2140,6 +2142,8 @@ options! 
{ profile_generate: SwitchWithOptPath = (SwitchWithOptPath::Disabled, parse_switch_with_opt_path, [TRACKED], "compile the program with profiling instrumentation"), + profile_sample_use: Option = (None, parse_opt_pathbuf, [TRACKED], + "use the given `.prof` file for sample-based profile-guided optimization"), profile_use: Option = (None, parse_opt_pathbuf, [TRACKED], "use the given `.profdata` file for profile-guided optimization"), #[rustc_lint_opt_deny_field_access("use `Session::relocation_model` instead of this field")] @@ -2252,8 +2256,6 @@ options! { "inject the given attribute in the crate"), cross_crate_inline_threshold: InliningThreshold = (InliningThreshold::Sometimes(100), parse_inlining_threshold, [TRACKED], "threshold to allow cross crate inlining of functions"), - debug_info_for_profiling: bool = (false, parse_bool, [TRACKED], - "emit discriminators and other data necessary for AutoFDO"), debug_info_type_line_numbers: bool = (false, parse_bool, [TRACKED], "emit type and line information for additional data types (default: no)"), debuginfo_compression: DebugInfoCompression = (DebugInfoCompression::None, parse_debuginfo_compression, [TRACKED], @@ -2560,8 +2562,6 @@ options! 
{ "how to run proc-macro code (default: same-thread)"), profile_closures: bool = (false, parse_no_value, [UNTRACKED], "profile size of closures"), - profile_sample_use: Option = (None, parse_opt_pathbuf, [TRACKED], - "use the given `.prof` file for sampled profile-guided optimization (also known as AutoFDO)"), profiler_runtime: String = (String::from("profiler_builtins"), parse_string, [TRACKED], "name of the profiler runtime crate to automatically inject (default: `profiler_builtins`)"), query_dep_graph: bool = (false, parse_bool, [UNTRACKED], diff --git a/compiler/rustc_session/src/session.rs b/compiler/rustc_session/src/session.rs index a9e7f1503b9ca..647cc5484332f 100644 --- a/compiler/rustc_session/src/session.rs +++ b/compiler/rustc_session/src/session.rs @@ -1150,7 +1150,7 @@ fn validate_commandline_args_with_session_available(sess: &Session) { } // Do the same for sample profile data. - if let Some(ref path) = sess.opts.unstable_opts.profile_sample_use { + if let Some(ref path) = sess.opts.cg.profile_sample_use { if !path.exists() { sess.dcx().emit_err(errors::ProfileSampleUseFileDoesNotExist { path }); } diff --git a/src/doc/rustc/src/codegen-options/index.md b/src/doc/rustc/src/codegen-options/index.md index 7af10298470ee..b51622cdf5976 100644 --- a/src/doc/rustc/src/codegen-options/index.md +++ b/src/doc/rustc/src/codegen-options/index.md @@ -91,6 +91,10 @@ following values: Note: The [`-g` flag][option-g-debug] is an alias for `-C debuginfo=2`. +## debuginfo-for-profiling + +Emit extra debug info (currently it's [DWARF discriminators](https://llvm.org/doxygen/AddDiscriminators_8cpp.html)) to make sample profile more accurate. See the chapter on [profile-guided optimization] for more information. + ## default-linker-libraries This flag controls whether or not the linker includes its default libraries. @@ -539,6 +543,13 @@ an optional argument which is the path to a directory into which the instrumented binary will emit the collected data.
See the chapter on [profile-guided optimization] for more information. +## profile-sample-use + +This flag specifies the profiling data file to be used for sample-based +profile-guided optimization (SPGO). The flag takes a mandatory argument which is +the path to a valid `.prof` file. See the chapter on [profile-guided optimization] +for more information. + ## profile-use This flag specifies the profiling data file to be used for profile-guided diff --git a/src/doc/rustc/src/profile-guided-optimization.md b/src/doc/rustc/src/profile-guided-optimization.md index eeeeffe65a029..ccb2af579a819 100644 --- a/src/doc/rustc/src/profile-guided-optimization.md +++ b/src/doc/rustc/src/profile-guided-optimization.md @@ -8,27 +8,35 @@ This chapter describes what PGO is, what it is good for, and how it can be used. The basic concept of PGO is to collect data about the typical execution of a program (e.g. which branches it is likely to take) and then use this data to inform optimizations such as inlining, machine-code layout, -register allocation, etc. +register allocation, etc. Optimization levels `-Copt-level=2` and above are +recommended for use of profile guided optimization. + +`rustc` supports profile guided optimization with two different kinds of profiling. A sampling profiler can generate a profile with very low runtime overhead, or you can build an instrumented version of the code that collects more detailed profile information. Both kinds of profiles can provide execution counts for instructions in the code and information on branches taken and function invocation. There are different ways of collecting data about a program's execution. One is to run the program inside a profiler (such as `perf`) and another is to create an instrumented binary, that is, a binary that has data collection built into it, and run that. -The latter usually provides more accurate data and it is also what is -supported by `rustc`. 
-## Usage +## Differences Between Instrumentation and Sampling + +Although both techniques are used for similar purposes, there are important differences between the two: + +1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated via `-Cprofile-generate` must be used with `-Cprofile-use`. Similarly, sampling profiles generated by external profilers must be converted and used with `-Cprofile-sample-use`. +2. Sampling profiles must be generated by an external tool. The profile generated by that tool must then be converted into a format that can be read by LLVM. The section on sampling profilers describes one of the supported sampling profile formats. + +## Profiling with Instrumentation Generating a PGO-optimized program involves following a workflow with four steps: 1. Compile the program with instrumentation enabled - (e.g. `rustc -Cprofile-generate=/tmp/pgo-data main.rs`) + (e.g. `rustc -Cprofile-generate=/tmp/pgo-data -O main.rs`) 2. Run the instrumented program (e.g. `./main`) which generates a `default_<id>.profraw` file 3. Convert the `.profraw` file into a `.profdata` file using LLVM's `llvm-profdata` tool 4. Compile the program again, this time making use of the profiling data - (for example `rustc -Cprofile-use=merged.profdata main.rs`) + (for example `rustc -Cprofile-use=merged.profdata -O main.rs`) An instrumented program will create one or more `.profraw` files, one for each instrumented binary. E.g. an instrumented executable that loads two instrumented @@ -60,7 +68,7 @@ The `llvm-profdata` tool merges multiple `.profraw` files into a single ```bash # STEP 1: Compile the binary with instrumentation -rustc -Cprofile-generate=/tmp/pgo-data -O ./main.rs +rustc -Cprofile-generate=/tmp/pgo-data -O main.rs # STEP 2: Run the binary a few times, maybe with common sets of args.
# Each run will create or update `.profraw` files in /tmp/pgo-data @@ -69,11 +77,11 @@ rustc -Cprofile-generate=/tmp/pgo-data -O ./main.rs ./main mydata3.csv # STEP 3: Merge and post-process all the `.profraw` files in /tmp/pgo-data -llvm-profdata merge -o ./merged.profdata /tmp/pgo-data +llvm-profdata merge -o merged.profdata /tmp/pgo-data # STEP 4: Use the merged `.profdata` file during optimization. All `rustc` # flags have to be the same. -rustc -Cprofile-use=./merged.profdata -O ./main.rs +rustc -Cprofile-use=./merged.profdata -O main.rs ``` ### A Complete Cargo Workflow @@ -136,11 +144,77 @@ RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \ Cargo prior to version 1.39 that will prevent PGO from working correctly. Be sure to use Cargo 1.39 or newer when doing PGO. +## Profiling with Sampling + +Sampling profilers are used to collect runtime information, such as hardware counters, while your application executes. They are typically very efficient and do not incur a large runtime overhead. The sample data collected by the profiler can be used during compilation to determine what the most executed areas of the code are. + +Using the data from a sample profiler requires some changes in the way a program is built. Before the compiler can use profiling information, the code needs to execute under the profiler. The following is the usual build cycle when using sample profilers for optimization: + +1. Build the code with source line table information. You can use all the usual build flags that you always build your application with. The only requirement is that DWARF debug info including source line information is generated. This DWARF information is important for the profiler to be able to map instructions back to source line locations. The usefulness of this DWARF information can be improved with the `-Cdebuginfo-for-profiling` option. For example: + +```bash +rustc -Cdebuginfo=line-tables-only -Cdebuginfo-for-profiling -O main.rs +``` + +2. 
Run the executable under a sampling profiler. The specific profiler you use does not really matter, as long as its output can be converted into the format that the LLVM optimizer understands. + +Two such profilers are the Linux Perf [profiler](https://perf.wiki.kernel.org/) and Intel’s Sampling Enabling Product (SEP), available as part of Intel VTune. While Perf is Linux-specific, SEP can be used on Linux, Windows, and FreeBSD. + +The LLVM tool `llvm-profgen` can convert output of either Perf or SEP. An external project, [AutoFDO](https://github.com/google/autofdo), also provides a `create_llvm_prof` tool which supports Linux Perf output. + +When using Perf: + +```bash +perf record -b -e BR_INST_RETIRED.NEAR_TAKEN:uppp ./main +``` + +If the event above is unavailable, `branches:u` is probably next-best. + +Note the use of the `-b` flag. This tells Perf to use the Last Branch Record (LBR) to record call chains. While this is not strictly required, it provides better call information, which improves the accuracy of the profile data. + +When using SEP: + +```bash +sep -start -out code.tb7 -ec BR_INST_RETIRED.NEAR_TAKEN:precise=yes:pdir -lbr no_filter:usr -perf-script brstack -app ./main +``` + +This produces a `code.perf.data.script` output which can be used with `llvm-profgen`’s `--perfscript` input option. + +3. Convert the collected profile data to LLVM’s sample profile format. This is currently supported via the AutoFDO converter `create_llvm_prof`. Once built and installed, you can convert the `perf.data` file to LLVM using the command: + +```bash +create_llvm_prof --binary=./main --out=main.prof +``` + +This will read `perf.data` and the binary file `./main` and emit the profile data in `main.prof`. Note that if you ran `perf` without the `-b` flag, you need to use `--use_lbr=false` when calling `create_llvm_prof`. 
+ +Alternatively, the LLVM tool `llvm-profgen` can also be used to generate the LLVM sample profile: + +```bash +llvm-profgen --binary=./main --output=main.prof --perfdata=perf.data +``` + +Please note, `perf.data` must be collected with `-b` flag to Linux perf for the above step to work. + +When using SEP the output is in the textual format corresponding to `llvm-profgen --perfscript`. For example: + +```bash +llvm-profgen --binary=./main --output=main.prof --perfscript=code.perf.data.script +``` + +4. Build the code again using the collected profile. This step feeds the profile back to the optimizers. This should result in a binary that executes faster than the original one. Note that you are not required to build the code with the exact same arguments that you used in the first step. The only requirement is that you build the code with the same debug info options and `-Cprofile-sample-use`. + +```bash +rustc -Cdebuginfo=line-tables-only -Cprofile-sample-use=main.prof -Cdebuginfo-for-profiling -O main.rs +``` + +Note that Sample-based PGO in `rustc` is mostly tested on `x86-64` Linux platforms. It should work on other hardware architectures and operating systems but it's not heavily tested yet. + ## Further Reading `rustc`'s PGO support relies entirely on LLVM's implementation of the feature and is equivalent to what Clang offers via the `-fprofile-generate` / -`-fprofile-use` flags. The [Profile Guided Optimization][clang-pgo] section +`-fprofile-use` and `-fprofile-sample-use` flags. The [Profile Guided Optimization][clang-pgo] section in Clang's documentation is therefore an interesting read for anyone who wants to use PGO with Rust. @@ -151,11 +225,11 @@ to use PGO with Rust. As an alternative to directly using the compiler for Profile-Guided Optimization, you may choose to go with `cargo-pgo`, which has an intuitive command-line API and saves you the trouble of doing all the manual work. You can read more about -it in [cargo-pgo repository][cargo-pgo].
+it in [cargo-pgo repository][cargo-pgo]. For now, `cargo-pgo` supports only Instrumentation PGO. [cargo-pgo]: https://github.com/Kobzol/cargo-pgo -For the sake of completeness, here are the corresponding steps using `cargo-pgo`: +For the sake of completeness, here are the corresponding steps using `cargo-pgo` for Instrumentation PGO: ```bash # Install if you haven't already diff --git a/src/doc/unstable-book/src/compiler-flags/debug_info_for_profiling.md b/src/doc/unstable-book/src/compiler-flags/debug_info_for_profiling.md deleted file mode 100644 index ee72b6adf8e9f..0000000000000 --- a/src/doc/unstable-book/src/compiler-flags/debug_info_for_profiling.md +++ /dev/null @@ -1,35 +0,0 @@ -# `debug-info-for-profiling` - ---- - -## Introduction - -Automatic Feedback Directed Optimization (AFDO) is a method for using sampling -based profiles to guide optimizations. This is contrasted with other methods of -FDO or profile-guided optimization (PGO) which use instrumented profiling. - -Unlike PGO (controlled by the `rustc` flags `-Cprofile-generate` and -`-Cprofile-use`), a binary being profiled does not perform significantly worse, -and thus it's possible to profile binaries used in real workflows and not -necessary to construct artificial workflows. - -## Use - -In order to use AFDO, the target platform must be Linux running on an `x86_64` -architecture with the performance profiler `perf` available. In addition, the -external tool `create_llvm_prof` from [this repository] must be used. - -Given a Rust file `main.rs`, we can produce an optimized binary as follows: - -```shell -rustc -O -Zdebug-info-for-profiling main.rs -o main -perf record -b ./main -create_llvm_prof --binary=main --out=code.prof -rustc -O -Zprofile-sample-use=code.prof main.rs -o main2 -``` - -The `perf` command produces a profile `perf.data`, which is then used by the -`create_llvm_prof` command to create `code.prof`. 
This final profile is then -used by `rustc` to guide optimizations in producing the binary `main2`. - -[this repository]: https://github.com/google/autofdo diff --git a/src/doc/unstable-book/src/compiler-flags/profile_sample_use.md b/src/doc/unstable-book/src/compiler-flags/profile_sample_use.md deleted file mode 100644 index 2dd1f6f8e1a3a..0000000000000 --- a/src/doc/unstable-book/src/compiler-flags/profile_sample_use.md +++ /dev/null @@ -1,10 +0,0 @@ -# `profile-sample-use` - ---- - -`-Zprofile-sample-use=code.prof` directs `rustc` to use the profile -`code.prof` as a source for Automatic Feedback Directed Optimization (AFDO). -See the documentation of [`-Zdebug-info-for-profiling`] for more information -on using AFDO. - -[`-Zdebug-info-for-profiling`]: debug_info_for_profiling.html