diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index cbaf67d734547..af1dd99a8e146 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -15,8 +15,8 @@ use rustc_data_structures::profiling::{SelfProfilerRef, VerboseTimingGuard};
 use rustc_errors::emitter::Emitter;
 use rustc_errors::translation::Translator;
 use rustc_errors::{
-    Diag, DiagArgMap, DiagCtxt, DiagMessage, ErrCode, FatalErrorMarker, Level, MultiSpan, Style,
-    Suggestions,
+    Diag, DiagArgMap, DiagCtxt, DiagMessage, ErrCode, FatalError, FatalErrorMarker, Level,
+    MultiSpan, Style, Suggestions,
 };
 use rustc_fs_util::link_or_copy;
 use rustc_incremental::{
@@ -380,7 +380,7 @@ fn generate_thin_lto_work<B: ExtraBackendMethods>(
     each_linked_rlib_for_lto: &[PathBuf],
     needs_thin_lto: Vec<(String, B::ThinBuffer)>,
     import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-) -> Vec<(WorkItem<B>, u64)> {
+) -> Vec<(ThinLtoWorkItem<B>, u64)> {
     let _prof_timer = cgcx.prof.generic_activity("codegen_thin_generate_lto_work");
 
     let (lto_modules, copy_jobs) = B::run_thin_lto(
@@ -394,11 +394,11 @@ fn generate_thin_lto_work<B: ExtraBackendMethods>(
         .into_iter()
         .map(|module| {
             let cost = module.cost();
-            (WorkItem::ThinLto(module), cost)
+            (ThinLtoWorkItem::ThinLto(module), cost)
         })
         .chain(copy_jobs.into_iter().map(|wp| {
             (
-                WorkItem::CopyPostLtoArtifacts(CachedModuleCodegen {
+                ThinLtoWorkItem::CopyPostLtoArtifacts(CachedModuleCodegen {
                     name: wp.cgu_name.clone(),
                     source: wp,
                 }),
@@ -703,64 +703,73 @@ pub(crate) enum WorkItem<B: WriteBackendMethods> {
     /// Copy the post-LTO artifacts from the incremental cache to the output
     /// directory.
     CopyPostLtoArtifacts(CachedModuleCodegen),
-    /// Performs fat LTO on the given module.
-    FatLto {
-        exported_symbols_for_lto: Arc<Vec<String>>,
-        each_linked_rlib_for_lto: Vec<PathBuf>,
-        needs_fat_lto: Vec<FatLtoInput<B>>,
-        import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-    },
+}
+
+enum ThinLtoWorkItem<B: WriteBackendMethods> {
+    /// Copy the post-LTO artifacts from the incremental cache to the output
+    /// directory.
+    CopyPostLtoArtifacts(CachedModuleCodegen),
     /// Performs thin-LTO on the given module.
     ThinLto(lto::ThinModule<B>),
 }
 
+// `pthread_setname()` on *nix ignores anything beyond the first 15
+// bytes. Use short descriptions to maximize the space available for
+// the module name.
+#[cfg(not(windows))]
+fn desc(short: &str, _long: &str, name: &str) -> String {
+    // The short label is three bytes, and is followed by a space. That
+    // leaves 11 bytes for the CGU name. How we obtain those 11 bytes
+    // depends on the CGU name form.
+    //
+    // - Non-incremental, e.g. `regex.f10ba03eb5ec7975-cgu.0`: the part
+    //   before the `-cgu.0` is the same for every CGU, so use the
+    //   `cgu.0` part. The number suffix will be different for each
+    //   CGU.
+    //
+    // - Incremental (normal), e.g. `2i52vvl2hco29us0`: use the whole
+    //   name because each CGU will have a unique ASCII hash, and the
+    //   first 11 bytes will be enough to identify it.
+    //
+    // - Incremental (with `-Zhuman-readable-cgu-names`), e.g.
+    //   `regex.f10ba03eb5ec7975-re_builder.volatile`: use the whole
+    //   name. The first 11 bytes won't be enough to uniquely identify
+    //   it, but no obvious substring will, and this is a rarely used
+    //   option so it doesn't matter much.
+    //
+    assert_eq!(short.len(), 3);
+    let name = if let Some(index) = name.find("-cgu.") {
+        &name[index + 1..] // +1 skips the leading '-'.
+    } else {
+        name
+    };
+    format!("{short} {name}")
+}
+
+// Windows has no thread name length limit, so use more descriptive names.
+#[cfg(windows)]
+fn desc(_short: &str, long: &str, name: &str) -> String {
+    format!("{long} {name}")
+}
+
 impl<B: WriteBackendMethods> WorkItem<B> {
     /// Generate a short description of this work item suitable for use as a thread name.
     fn short_description(&self) -> String {
-        // `pthread_setname()` on *nix ignores anything beyond the first 15
-        // bytes. Use short descriptions to maximize the space available for
-        // the module name.
-        #[cfg(not(windows))]
-        fn desc(short: &str, _long: &str, name: &str) -> String {
-            // The short label is three bytes, and is followed by a space. That
-            // leaves 11 bytes for the CGU name. How we obtain those 11 bytes
-            // depends on the CGU name form.
-            //
-            // - Non-incremental, e.g. `regex.f10ba03eb5ec7975-cgu.0`: the part
-            //   before the `-cgu.0` is the same for every CGU, so use the
-            //   `cgu.0` part. The number suffix will be different for each
-            //   CGU.
-            //
-            // - Incremental (normal), e.g. `2i52vvl2hco29us0`: use the whole
-            //   name because each CGU will have a unique ASCII hash, and the
-            //   first 11 bytes will be enough to identify it.
-            //
-            // - Incremental (with `-Zhuman-readable-cgu-names`), e.g.
-            //   `regex.f10ba03eb5ec7975-re_builder.volatile`: use the whole
-            //   name. The first 11 bytes won't be enough to uniquely identify
-            //   it, but no obvious substring will, and this is a rarely used
-            //   option so it doesn't matter much.
-            //
-            assert_eq!(short.len(), 3);
-            let name = if let Some(index) = name.find("-cgu.") {
-                &name[index + 1..] // +1 skips the leading '-'.
-            } else {
-                name
-            };
-            format!("{short} {name}")
-        }
-
-        // Windows has no thread name length limit, so use more descriptive names.
-        #[cfg(windows)]
-        fn desc(_short: &str, long: &str, name: &str) -> String {
-            format!("{long} {name}")
-        }
-
         match self {
             WorkItem::Optimize(m) => desc("opt", "optimize module", &m.name),
             WorkItem::CopyPostLtoArtifacts(m) => desc("cpy", "copy LTO artifacts for", &m.name),
-            WorkItem::FatLto { .. } => desc("lto", "fat LTO module", "everything"),
-            WorkItem::ThinLto(m) => desc("lto", "thin-LTO module", m.name()),
+        }
+    }
+}
+
+impl<B: WriteBackendMethods> ThinLtoWorkItem<B> {
+    /// Generate a short description of this work item suitable for use as a thread name.
+    fn short_description(&self) -> String {
+        match self {
+            ThinLtoWorkItem::CopyPostLtoArtifacts(m) => {
+                desc("cpy", "copy LTO artifacts for", &m.name)
+            }
+            ThinLtoWorkItem::ThinLto(m) => desc("lto", "thin-LTO module", m.name()),
         }
     }
 }
@@ -891,7 +900,7 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
 fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     module: CachedModuleCodegen,
-) -> WorkItemResult<B> {
+) -> CompiledModule {
     let _timer = cgcx
         .prof
         .generic_activity_with_arg("codegen_copy_artifacts_from_incr_cache", &*module.name);
@@ -964,7 +973,7 @@ fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
         cgcx.create_dcx().handle().emit_fatal(errors::NoSavedObjectFile { cgu_name: &module.name })
     }
 
-    WorkItemResult::Finished(CompiledModule {
+    CompiledModule {
         links_from_incr_cache,
         kind: ModuleKind::Regular,
         name: module.name,
@@ -973,17 +982,19 @@
         bytecode,
         assembly,
         llvm_ir,
-    })
+    }
 }
 
-fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
+fn do_fat_lto<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     exported_symbols_for_lto: &[String],
     each_linked_rlib_for_lto: &[PathBuf],
     mut needs_fat_lto: Vec<FatLtoInput<B>>,
     import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-) -> WorkItemResult<B> {
-    let _timer = cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", "everything");
+) -> CompiledModule {
+    let _timer = cgcx.prof.verbose_generic_activity("LLVM_fatlto");
+
+    check_lto_allowed(&cgcx);
 
     for (module, wp) in import_only_modules {
         needs_fat_lto.push(FatLtoInput::Serialized { name: wp.cgu_name, buffer: module })
@@ -995,19 +1006,155 @@
         each_linked_rlib_for_lto,
         needs_fat_lto,
     );
-    let module = B::codegen(cgcx, module, &cgcx.module_config);
-    WorkItemResult::Finished(module)
+    B::codegen(cgcx, module, &cgcx.module_config)
+}
+
+fn do_thin_lto<'a, B: ExtraBackendMethods>(
+    cgcx: &'a CodegenContext<B>,
+    exported_symbols_for_lto: Arc<Vec<String>>,
+    each_linked_rlib_for_lto: Vec<PathBuf>,
+    needs_thin_lto: Vec<(String, <B as WriteBackendMethods>::ThinBuffer)>,
+    lto_import_only_modules: Vec<(
+        SerializedModule<<B as WriteBackendMethods>::ModuleBuffer>,
+        WorkProduct,
+    )>,
+) -> Vec<CompiledModule> {
+    let _timer = cgcx.prof.verbose_generic_activity("LLVM_thinlto");
+
+    check_lto_allowed(&cgcx);
+
+    let (coordinator_send, coordinator_receive) = channel();
+
+    // First up, convert our jobserver into a helper thread so we can use normal
+    // mpsc channels to manage our messages and such.
+    // After we've requested tokens then we'll, when we can,
+    // get tokens on `coordinator_receive` which will
+    // get managed in the main loop below.
+    let coordinator_send2 = coordinator_send.clone();
+    let helper = jobserver::client()
+        .into_helper_thread(move |token| {
+            drop(coordinator_send2.send(ThinLtoMessage::Token(token)));
+        })
+        .expect("failed to spawn helper thread");
+
+    let mut work_items = vec![];
+
+    // We have LTO work to do. Perform the serial work here of
+    // figuring out what we're going to LTO and then push a
+    // bunch of work items onto our queue to do LTO. This all
+    // happens on the coordinator thread but it's very quick so
+    // we don't worry about tokens.
+    for (work, cost) in generate_thin_lto_work(
+        cgcx,
+        &exported_symbols_for_lto,
+        &each_linked_rlib_for_lto,
+        needs_thin_lto,
+        lto_import_only_modules,
+    ) {
+        let insertion_index =
+            work_items.binary_search_by_key(&cost, |&(_, cost)| cost).unwrap_or_else(|e| e);
+        work_items.insert(insertion_index, (work, cost));
+        if cgcx.parallel {
+            helper.request_token();
+        }
+    }
+
+    let mut codegen_aborted = None;
+
+    // These are the Jobserver Tokens we currently hold. Does not include
+    // the implicit Token the compiler process owns no matter what.
+    let mut tokens = vec![];
+
+    // Amount of tokens that are used (including the implicit token).
+    let mut used_token_count = 0;
+
+    let mut compiled_modules = vec![];
+
+    // Run the message loop while there's still anything that needs message
+    // processing. Note that as soon as codegen is aborted we simply want to
+    // wait for all existing work to finish, so many of the conditions here
+    // only apply if codegen hasn't been aborted as they represent pending
+    // work to be done.
+    loop {
+        if codegen_aborted.is_none() {
+            if used_token_count == 0 && work_items.is_empty() {
+                // All codegen work is done.
+                break;
+            }
+
+            // Spin up what work we can, only doing this while we've got available
+            // parallelism slots and work left to spawn.
+            while used_token_count < tokens.len() + 1
+                && let Some((item, _)) = work_items.pop()
+            {
+                spawn_thin_lto_work(&cgcx, coordinator_send.clone(), item);
+                used_token_count += 1;
+            }
+        } else {
+            // Don't queue up any more work if codegen was aborted, we're
+            // just waiting for our existing children to finish.
+            if used_token_count == 0 {
+                break;
+            }
+        }
+
+        // Relinquish accidentally acquired extra tokens. Subtract 1 for the implicit token.
+        tokens.truncate(used_token_count.saturating_sub(1));
+
+        match coordinator_receive.recv().unwrap() {
+            // Save the token locally and the next turn of the loop will use
+            // this to spawn a new unit of work, or it may get dropped
+            // immediately if we have no more work to spawn.
+            ThinLtoMessage::Token(token) => match token {
+                Ok(token) => {
+                    tokens.push(token);
+                }
+                Err(e) => {
+                    let msg = &format!("failed to acquire jobserver token: {e}");
+                    cgcx.diag_emitter.fatal(msg);
+                    codegen_aborted = Some(FatalError);
+                }
+            },
+
+            ThinLtoMessage::WorkItem { result } => {
+                // If a thread exits successfully then we drop a token associated
+                // with that worker and update our `used_token_count` count.
+                // We may later re-acquire a token to continue running more work.
+                // We may also not actually drop a token here if the worker was
+                // running with an "ephemeral token".
+                used_token_count -= 1;
+
+                match result {
+                    Ok(compiled_module) => compiled_modules.push(compiled_module),
+                    Err(Some(WorkerFatalError)) => {
+                        // Like `CodegenAborted`, wait for remaining work to finish.
+                        codegen_aborted = Some(FatalError);
+                    }
+                    Err(None) => {
+                        // If the thread failed that means it panicked, so
+                        // we abort immediately.
+                        bug!("worker thread panicked");
+                    }
+                }
+            }
+        }
+    }
+
+    if let Some(codegen_aborted) = codegen_aborted {
+        codegen_aborted.raise();
+    }
+
+    compiled_modules
 }
 
 fn execute_thin_lto_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     module: lto::ThinModule<B>,
-) -> WorkItemResult<B> {
+) -> CompiledModule {
     let _timer = cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", module.name());
 
     let module = B::optimize_thin(cgcx, module);
-    let module = B::codegen(cgcx, module, &cgcx.module_config);
-    WorkItemResult::Finished(module)
+    B::codegen(cgcx, module, &cgcx.module_config)
 }
 
 /// Messages sent to the coordinator.
@@ -1041,6 +1188,17 @@ pub(crate) enum Message<B: WriteBackendMethods> {
     CodegenAborted,
 }
 
+/// Messages sent to the coordinator.
+pub(crate) enum ThinLtoMessage {
+    /// A jobserver token has become available. Sent from the jobserver helper
+    /// thread.
+    Token(io::Result<Acquired>),
+
+    /// The backend has finished processing a work item for a codegen unit.
+    /// Sent from a backend worker thread.
+    WorkItem { result: Result<CompiledModule, Option<WorkerFatalError>> },
+}
+
 /// A message sent from the coordinator thread to the main thread telling it to
 /// process another codegen unit.
 pub struct CguMessage;
@@ -1092,9 +1250,8 @@ fn start_executing_work<B: ExtraBackendMethods>(
     regular_config: Arc<ModuleConfig>,
     allocator_config: Arc<ModuleConfig>,
     allocator_module: Option<ModuleCodegen<B::Module>>,
-    tx_to_llvm_workers: Sender<Box<dyn Any + Send>>,
+    coordinator_send: Sender<Box<dyn Any + Send>>,
 ) -> thread::JoinHandle<Result<CompiledModules, ()>> {
-    let coordinator_send = tx_to_llvm_workers;
     let sess = tcx.sess;
 
     let mut each_linked_rlib_for_lto = Vec::new();
@@ -1314,7 +1471,6 @@
         let mut needs_fat_lto = Vec::new();
         let mut needs_thin_lto = Vec::new();
         let mut lto_import_only_modules = Vec::new();
-        let mut started_lto = false;
 
         /// Possible state transitions:
         /// - Ongoing -> Completed
@@ -1404,63 +1560,8 @@
                 if running_with_any_token(main_thread_state, running_with_own_token) == 0
                     && work_items.is_empty()
                 {
-                    // All codegen work is done. Do we have LTO work to do?
-                    if needs_fat_lto.is_empty()
-                        && needs_thin_lto.is_empty()
-                        && lto_import_only_modules.is_empty()
-                    {
-                        // Nothing more to do!
-                        break;
-                    }
-
-                    // We have LTO work to do. Perform the serial work here of
-                    // figuring out what we're going to LTO and then push a
-                    // bunch of work items onto our queue to do LTO. This all
-                    // happens on the coordinator thread but it's very quick so
-                    // we don't worry about tokens.
-                    assert!(!started_lto);
-                    started_lto = true;
-
-                    let needs_fat_lto = mem::take(&mut needs_fat_lto);
-                    let needs_thin_lto = mem::take(&mut needs_thin_lto);
-                    let import_only_modules = mem::take(&mut lto_import_only_modules);
-                    let each_linked_rlib_file_for_lto =
-                        mem::take(&mut each_linked_rlib_file_for_lto);
-
-                    check_lto_allowed(&cgcx);
-
-                    if !needs_fat_lto.is_empty() {
-                        assert!(needs_thin_lto.is_empty());
-
-                        work_items.push((
-                            WorkItem::FatLto {
-                                exported_symbols_for_lto: Arc::clone(&exported_symbols_for_lto),
-                                each_linked_rlib_for_lto: each_linked_rlib_file_for_lto,
-                                needs_fat_lto,
-                                import_only_modules,
-                            },
-                            0,
-                        ));
-                        if cgcx.parallel {
-                            helper.request_token();
-                        }
-                    } else {
-                        for (work, cost) in generate_thin_lto_work(
-                            &cgcx,
-                            &exported_symbols_for_lto,
-                            &each_linked_rlib_file_for_lto,
-                            needs_thin_lto,
-                            import_only_modules,
-                        ) {
-                            let insertion_index = work_items
-                                .binary_search_by_key(&cost, |&(_, cost)| cost)
-                                .unwrap_or_else(|e| e);
-                            work_items.insert(insertion_index, (work, cost));
-                            if cgcx.parallel {
-                                helper.request_token();
-                            }
-                        }
-                    }
+                    // All codegen work is done.
+                    break;
                 }
 
                 // In this branch, we know that everything has been codegened,
@@ -1598,12 +1699,10 @@
                             compiled_modules.push(compiled_module);
                         }
                         Ok(WorkItemResult::NeedsFatLto(fat_lto_input)) => {
-                            assert!(!started_lto);
                             assert!(needs_thin_lto.is_empty());
                             needs_fat_lto.push(fat_lto_input);
                         }
                         Ok(WorkItemResult::NeedsThinLto(name, thin_buffer)) => {
-                            assert!(!started_lto);
                             assert!(needs_fat_lto.is_empty());
                             needs_thin_lto.push((name, thin_buffer));
                         }
@@ -1620,7 +1719,6 @@
                     }
 
                     Message::AddImportOnlyModule { module_data, work_product } => {
-                        assert!(!started_lto);
                         assert_eq!(codegen_state, Ongoing);
                         assert_eq!(main_thread_state, MainThreadState::Codegenning);
                         lto_import_only_modules.push((module_data, work_product));
@@ -1629,12 +1727,43 @@
             }
         }
 
+        // Drop to print timings
+        drop(llvm_start_time);
+
         if codegen_state == Aborted {
             return Err(());
         }
 
-        // Drop to print timings
-        drop(llvm_start_time);
+        drop(codegen_state);
+        drop(tokens);
+        drop(helper);
+        assert!(work_items.is_empty());
+
+        if !needs_fat_lto.is_empty() {
+            assert!(compiled_modules.is_empty());
+            assert!(needs_thin_lto.is_empty());
+
+            // This uses the implicit token
+            let module = do_fat_lto(
+                &cgcx,
+                &exported_symbols_for_lto,
+                &each_linked_rlib_file_for_lto,
+                needs_fat_lto,
+                lto_import_only_modules,
+            );
+            compiled_modules.push(module);
+        } else if !needs_thin_lto.is_empty() || !lto_import_only_modules.is_empty() {
+            assert!(compiled_modules.is_empty());
+            assert!(needs_fat_lto.is_empty());
+
+            compiled_modules.extend(do_thin_lto(
+                &cgcx,
+                exported_symbols_for_lto,
+                each_linked_rlib_file_for_lto,
+                needs_thin_lto,
+                lto_import_only_modules,
+            ));
+        }
 
         // Regardless of what order these modules completed in, report them to
         // the backend in the same order every time to ensure that we're handing
@@ -1725,20 +1854,9 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
     B::spawn_named_thread(cgcx.time_trace, work.short_description(), move || {
         let result = std::panic::catch_unwind(AssertUnwindSafe(|| match work {
             WorkItem::Optimize(m) => execute_optimize_work_item(&cgcx, m),
-            WorkItem::CopyPostLtoArtifacts(m) => execute_copy_from_cache_work_item(&cgcx, m),
-            WorkItem::FatLto {
-                exported_symbols_for_lto,
-                each_linked_rlib_for_lto,
-                needs_fat_lto,
-                import_only_modules,
-            } => execute_fat_lto_work_item(
-                &cgcx,
-                &exported_symbols_for_lto,
-                &each_linked_rlib_for_lto,
-                needs_fat_lto,
-                import_only_modules,
-            ),
-            WorkItem::ThinLto(m) => execute_thin_lto_work_item(&cgcx, m),
+            WorkItem::CopyPostLtoArtifacts(m) => {
+                WorkItemResult::Finished(execute_copy_from_cache_work_item(&cgcx, m))
+            }
         }));
 
         let msg = match result {
@@ -1758,6 +1876,36 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
     .expect("failed to spawn work thread");
 }
 
+fn spawn_thin_lto_work<'a, B: ExtraBackendMethods>(
+    cgcx: &'a CodegenContext<B>,
+    coordinator_send: Sender<ThinLtoMessage>,
+    work: ThinLtoWorkItem<B>,
+) {
+    let cgcx = cgcx.clone();
+
+    B::spawn_named_thread(cgcx.time_trace, work.short_description(), move || {
+        let result = std::panic::catch_unwind(AssertUnwindSafe(|| match work {
+            ThinLtoWorkItem::CopyPostLtoArtifacts(m) => execute_copy_from_cache_work_item(&cgcx, m),
+            ThinLtoWorkItem::ThinLto(m) => execute_thin_lto_work_item(&cgcx, m),
+        }));
+
+        let msg = match result {
+            Ok(result) => ThinLtoMessage::WorkItem { result: Ok(result) },
+
+            // We ignore any `FatalError` coming out of `execute_work_item`, as a
+            // diagnostic was already sent off to the main thread - just surface
+            // that there was an error in this worker.
+            Err(err) if err.is::<FatalErrorMarker>() => {
+                ThinLtoMessage::WorkItem { result: Err(Some(WorkerFatalError)) }
+            }
+
+            Err(_) => ThinLtoMessage::WorkItem { result: Err(None) },
+        };
+        drop(coordinator_send.send(msg));
+    })
+    .expect("failed to spawn work thread");
+}
+
 enum SharedEmitterMessage {
     Diagnostic(Diagnostic),
     InlineAsmError(SpanData, String, Level, Option<(String, Vec<InnerSpan>)>),
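
Editorial note, not part of the patch: `do_thin_lto` above keeps `work_items` sorted in ascending cost order via `binary_search_by_key(..).unwrap_or_else(|e| e)`, so that `work_items.pop()` always schedules the currently most expensive module first. A minimal standalone sketch of that idiom, assuming nothing beyond the standard library (the `insert_by_cost` helper and the `&str` work items are illustrative, not names from this patch):

```rust
/// Insert `work` while keeping `items` sorted ascending by cost.
/// `binary_search_by_key` returns `Ok(i)` on an exact cost match and
/// `Err(i)` with the insertion point otherwise; `unwrap_or_else(|i| i)`
/// folds both cases into a valid insertion index.
fn insert_by_cost<Work>(items: &mut Vec<(Work, u64)>, work: Work, cost: u64) {
    let idx = items.binary_search_by_key(&cost, |&(_, c)| c).unwrap_or_else(|i| i);
    items.insert(idx, (work, cost));
}

fn main() {
    let mut queue: Vec<(&str, u64)> = Vec::new();
    insert_by_cost(&mut queue, "cgu.2", 40);
    insert_by_cost(&mut queue, "cgu.0", 90);
    insert_by_cost(&mut queue, "cgu.1", 10);
    // `pop()` takes from the high-cost end, so the most expensive
    // work item is dispatched to a worker thread first.
    assert_eq!(queue.pop(), Some(("cgu.0", 90)));
}
```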