Skip to content

Commit

Permalink
webgpu: Implement device lost (#32354)
Browse files Browse the repository at this point in the history
* device lost promise should be init at creation of device object

* device lost impl

* lock for device poll

workaround for wgpu deadlocks

* expect

* Less lost reason reasoning in script
  • Loading branch information
sagudev committed Jun 17, 2024
1 parent 3381f2a commit cbc9304
Show file tree
Hide file tree
Showing 9 changed files with 223 additions and 5,627 deletions.
2 changes: 1 addition & 1 deletion components/script/dom/bindings/codegen/Bindings.conf
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ DOMInterfaces = {
},

'GPUDevice': {
'inRealms': ['PopErrorScope', 'GetLost', 'CreateComputePipelineAsync', 'CreateRenderPipelineAsync'],
'inRealms': ['PopErrorScope', 'CreateComputePipelineAsync', 'CreateRenderPipelineAsync'],
}

}
26 changes: 18 additions & 8 deletions components/script/dom/globalscope.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ use script_traits::{
};
use servo_url::{ImmutableOrigin, MutableOrigin, ServoUrl};
use uuid::Uuid;
use webgpu::WebGPUDevice;
use webgpu::{DeviceLostReason, WebGPUDevice};

use super::bindings::codegen::Bindings::WebGPUBinding::GPUDeviceLostReason;
use super::bindings::trace::HashMapTracedValues;
use crate::dom::bindings::cell::{DomRefCell, RefMut};
use crate::dom::bindings::codegen::Bindings::BroadcastChannelBinding::BroadcastChannelMethods;
Expand Down Expand Up @@ -3093,16 +3094,25 @@ impl GlobalScope {
.insert(device.id(), Dom::from_ref(device));
}

pub fn remove_gpu_device(&self, device: WebGPUDevice) {
let _ = self.gpu_devices.borrow_mut().remove(&device);
/// Handles a device-lost notification for `device`.
///
/// Translates the WebGPU-side `reason` into the matching script-side
/// `GPUDeviceLostReason`, unregisters the device from `gpu_devices` and
/// hands the translated reason together with `msg` to the removed device's
/// `lose` method.
///
/// # Panics
///
/// Panics if `device` is not present in `gpu_devices`.
pub fn gpu_device_lost(&self, device: WebGPUDevice, reason: DeviceLostReason, msg: String) {
    // Keep the realm guard alive for the remainder of the function.
    let _realm = enter_realm(&*self);
    let script_reason = match reason {
        DeviceLostReason::Destroyed => GPUDeviceLostReason::Destroyed,
        DeviceLostReason::Unknown => GPUDeviceLostReason::Unknown,
    };
    self.gpu_devices
        .borrow_mut()
        .remove(&device)
        .expect("GPUDevice should still exists")
        .lose(script_reason, msg);
}

pub fn handle_uncaptured_gpu_error(&self, device: WebGPUDevice, error: webgpu::Error) {
self.gpu_devices
.borrow()
.get(&device)
.expect("GPUDevice not found")
.fire_uncaptured_error(error);
if let Some(gpu_device) = self.gpu_devices.borrow().get(&device) {
gpu_device.fire_uncaptured_error(error);
} else {
warn!("Recived error for lost GPUDevice!")
}
}

pub fn handle_gamepad_event(&self, gamepad_event: GamepadEvent) {
Expand Down
42 changes: 21 additions & 21 deletions components/script/dom/gpudevice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,9 @@ pub struct GPUDevice {
#[no_trace]
device: webgpu::WebGPUDevice,
default_queue: Dom<GPUQueue>,
/// <https://gpuweb.github.io/gpuweb/#dom-gpudevice-lost>
#[ignore_malloc_size_of = "promises are hard"]
lost_promise: DomRefCell<Option<Rc<Promise>>>,
lost_promise: DomRefCell<Rc<Promise>>,
valid: Cell<bool>,
}

Expand All @@ -100,6 +101,7 @@ impl GPUDevice {
device: webgpu::WebGPUDevice,
queue: &GPUQueue,
label: String,
lost_promise: Rc<Promise>,
) -> Self {
Self {
eventtarget: EventTarget::new_inherited(),
Expand All @@ -111,7 +113,7 @@ impl GPUDevice {
label: DomRefCell::new(USVString::from(label)),
device,
default_queue: Dom::from_ref(queue),
lost_promise: DomRefCell::new(None),
lost_promise: DomRefCell::new(lost_promise),
valid: Cell::new(true),
}
}
Expand All @@ -131,9 +133,18 @@ impl GPUDevice {
let queue = GPUQueue::new(global, channel.clone(), queue);
let limits = GPUSupportedLimits::new(global, limits);
let features = GPUSupportedFeatures::Constructor(global, None, features).unwrap();
let lost_promise = Promise::new(global);
let device = reflect_dom_object(
Box::new(GPUDevice::new_inherited(
channel, adapter, extensions, &features, &limits, device, &queue, label,
channel,
adapter,
extensions,
&features,
&limits,
device,
&queue,
label,
lost_promise,
)),
global,
);
Expand Down Expand Up @@ -206,18 +217,11 @@ impl GPUDevice {
}

/// <https://gpuweb.github.io/gpuweb/#lose-the-device>
pub fn lose(&self, reason: GPUDeviceLostReason) {
if let Some(ref lost_promise) = *self.lost_promise.borrow() {
let global = &self.global();
let msg = match reason {
GPUDeviceLostReason::Unknown => "Unknown reason for your device loss.",
GPUDeviceLostReason::Destroyed => {
"Device self-destruction sequence activated successfully!"
},
};
let lost = GPUDeviceLostInfo::new(global, msg.into(), reason);
lost_promise.resolve_native(&*lost);
}
/// Resolves this device's `lost_promise` with a newly created
/// `GPUDeviceLostInfo` built from `msg` and `reason`.
pub fn lose(&self, reason: GPUDeviceLostReason, msg: String) {
    // Bind the borrow guard itself rather than using `let ref` on a
    // dereferenced temporary; the guard stays alive until the end of the
    // function, exactly as long as the promise is needed.
    let lost_promise = self.lost_promise.borrow();
    let global = self.global();
    let lost = GPUDeviceLostInfo::new(&global, msg.into(), reason);
    lost_promise.resolve_native(&*lost);
}
}

Expand Down Expand Up @@ -248,10 +252,8 @@ impl GPUDeviceMethods for GPUDevice {
}

/// <https://gpuweb.github.io/gpuweb/#dom-gpudevice-lost>
fn GetLost(&self, comp: InRealm) -> Fallible<Rc<Promise>> {
let promise = Promise::new_in_current_realm(comp);
*self.lost_promise.borrow_mut() = Some(promise.clone());
Ok(promise)
/// Returns a new reference-counted handle to the promise held in
/// `lost_promise`.
fn Lost(&self) -> Rc<Promise> {
    let stored = self.lost_promise.borrow();
    Rc::clone(&*stored)
}

/// <https://gpuweb.github.io/gpuweb/#dom-gpudevice-createbuffer>
Expand Down Expand Up @@ -1000,8 +1002,6 @@ impl GPUDeviceMethods for GPUDevice {
if self.valid.get() {
self.valid.set(false);

self.lose(GPUDeviceLostReason::Destroyed);

if let Err(e) = self
.channel
.0
Expand Down
3 changes: 1 addition & 2 deletions components/script/dom/webidls/WebGPU.webidl
Original file line number Diff line number Diff line change
Expand Up @@ -1075,14 +1075,13 @@ enum GPUDeviceLostReason {
"destroyed",
};

[Exposed=(Window, DedicatedWorker), Pref="dom.webgpu.enabled"]
// Value type of the `lost` promise attribute on GPUDevice: carries the
// loss reason and a message string.
[Exposed=(Window, Worker), Pref="dom.webgpu.enabled"]
interface GPUDeviceLostInfo {
readonly attribute GPUDeviceLostReason reason;
readonly attribute DOMString message;
};

partial interface GPUDevice {
[Throws]
readonly attribute Promise<GPUDeviceLostInfo> lost;
};

Expand Down
6 changes: 4 additions & 2 deletions components/script/script_thread.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2424,12 +2424,14 @@ impl ScriptThread {
WebGPUMsg::FreeTexture(id) => self.gpu_id_hub.lock().kill_texture_id(id),
WebGPUMsg::FreeTextureView(id) => self.gpu_id_hub.lock().kill_texture_view_id(id),
WebGPUMsg::Exit => *self.webgpu_port.borrow_mut() = None,
WebGPUMsg::CleanDevice {
WebGPUMsg::DeviceLost {
pipeline_id,
device,
reason,
msg,
} => {
let global = self.documents.borrow().find_global(pipeline_id).unwrap();
global.remove_gpu_device(device);
global.gpu_device_lost(device, reason, msg);
},
WebGPUMsg::UncapturedError {
device,
Expand Down
26 changes: 22 additions & 4 deletions components/webgpu/poll_thread.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
//! This is roughly based on <https://github.com/LucentFlux/wgpu-async/blob/1322c7e3fcdfc1865a472c7bbbf0e2e06dcf4da8/src/wgpu_future.rs>

use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;

use log::warn;
Expand Down Expand Up @@ -40,14 +40,25 @@ pub(crate) struct Poller {
is_done: Arc<AtomicBool>,
/// Handle to the WGPU poller thread (to be used for unparking the thread)
handle: Option<JoinHandle<()>>,
/// Lock for device maintain calls (in poll_all_devices and queue_submit)
///
/// This is a workaround for wgpu deadlocks: <https://github.com/gfx-rs/wgpu/issues/5572>
lock: Arc<Mutex<()>>,
}

#[inline]
fn poll_all_devices(global: &Arc<Global>, more_work: &mut bool, force_wait: bool) {
/// Polls every device known to `global` while holding `lock`.
///
/// On success `more_work` is set to `true` when at least one queue is not
/// yet empty and to `false` otherwise. On failure a warning is logged and
/// `more_work` is left untouched.
fn poll_all_devices(
    global: &Arc<Global>,
    more_work: &mut bool,
    force_wait: bool,
    lock: &Mutex<()>,
) {
    // Held until the function returns, on every path.
    let _maintain_guard = lock.lock().unwrap();
    let queues_empty = match global.poll_all_devices(force_wait) {
        Ok(empty) => empty,
        Err(e) => {
            warn!("Poller thread got `{e}` on poll_all_devices.");
            return;
        },
    };
    *more_work = !queues_empty;
}

impl Poller {
Expand All @@ -56,9 +67,11 @@ impl Poller {
let is_done = Arc::new(AtomicBool::new(false));
let work = work_count.clone();
let done = is_done.clone();
let lock = Arc::new(Mutex::new(()));
Self {
work_count,
is_done,
lock: Arc::clone(&lock),
handle: Some(
std::thread::Builder::new()
.name("WGPU poller".into())
Expand All @@ -69,9 +82,9 @@ impl Poller {
// so every `wake` (even spurious) will do at least one poll.
// this is mostly useful for stuff that is deferred
// to maintain calls in wgpu (device resource destruction)
poll_all_devices(&global, &mut more_work, false);
poll_all_devices(&global, &mut more_work, false, &lock);
while more_work || work.load(Ordering::Acquire) != 0 {
poll_all_devices(&global, &mut more_work, true);
poll_all_devices(&global, &mut more_work, true, &lock);
}
std::thread::park(); //TODO: should we use timeout here
}
Expand Down Expand Up @@ -101,6 +114,11 @@ impl Poller {
.thread()
.unpark();
}

/// Acquires the lock used around device maintain calls
/// (in poll_all_devices and queue_submit) and returns its guard.
///
/// # Panics
///
/// Panics if the mutex is poisoned.
pub(crate) fn lock(&self) -> MutexGuard<()> {
    let acquired = self.lock.lock();
    acquired.unwrap()
}
}

impl Drop for Poller {
Expand Down
15 changes: 12 additions & 3 deletions components/webgpu/script_messages.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ use crate::wgc::id::{
ShaderModuleId, StagingBufferId, SurfaceId, TextureId, TextureViewId,
};

/// Reason carried by [`WebGPUMsg::DeviceLost`].
///
/// <https://gpuweb.github.io/gpuweb/#enumdef-gpudevicelostreason>
#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
pub enum DeviceLostReason {
    /// Translated to `GPUDeviceLostReason::Unknown` on the script side.
    Unknown,
    /// Translated to `GPUDeviceLostReason::Destroyed` on the script side.
    Destroyed,
}

#[derive(Clone, Debug, Deserialize, Serialize)]
pub enum WebGPUMsg {
FreeAdapter(AdapterId),
Expand All @@ -34,14 +41,16 @@ pub enum WebGPUMsg {
FreeRenderBundle(RenderBundleId),
FreeStagingBuffer(StagingBufferId),
FreeQuerySet(QuerySetId),
CleanDevice {
UncapturedError {
device: WebGPUDevice,
pipeline_id: PipelineId,
error: Error,
},
UncapturedError {
DeviceLost {
device: WebGPUDevice,
pipeline_id: PipelineId,
error: Error,
reason: DeviceLostReason,
msg: String,
},
Exit,
}
Loading

0 comments on commit cbc9304

Please sign in to comment.