std: Avoid locks during TLS destruction on Windows

Gecko recently had a bug reported [1] with a deadlock in the Rust TLS implementation for Windows. TLS destructors are implemented in a sort of ad-hoc fashion on Windows as it doesn't natively support destructors for TLS keys. To work around this the runtime manages a list of TLS destructors and registers a hook to get run whenever a thread exits. When a thread exits it takes a look at the list and runs all destructors. Unfortunately it turns out that there's a lock which is held when our "at thread exit" callback is run. The callback then attempts to acquire a lock protecting the list of TLS destructors. Elsewhere in the codebase while we hold a lock over the TLS destructors we try to acquire the same lock held first before our special callback is run. And as a result, deadlock! This commit sidesteps the issue with a few small refactorings: * Removed support for destroying a TLS key on Windows. We don't actually ever exercise this as a public-facing API, and it's only used during `lazy_init` during racy situations. To handle that we just synchronize `lazy_init` globally on Windows so we never have to call `destroy`. * With no need to support removal the global synchronized `Vec` was tranformed to a lock-free linked list. With the removal of locks this means that iteration no long requires a lock and as such we won't run into the deadlock problem mentioned above. Note that it's still a general problem that you have to be extra super careful in TLS destructors. For example no code which runs a TLS destructor on Windows can call back into the Windows API to do a dynamic library lookup. Unfortunately I don't know of a great way around that, but this at least fixes the immediate problem that Gecko was seeing which is that with "well behaved" destructors the system would still deadlock! [1]: https://bugzilla.mozilla.org/show_bug.cgi?id=1358151
rust-lang · Apr 25, 2017 · fffe254 · fffe254
1 parent c7e724a
commit fffe254
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 114 deletions.
diff --git a/src/libstd/sys/unix/thread_local.rs b/src/libstd/sys/unix/thread_local.rs
@@ -38,3 +38,8 @@ pub unsafe fn destroy(key: Key) {
     let r = libc::pthread_key_delete(key);
     debug_assert_eq!(r, 0);
 }
+
+#[inline]
+pub fn requires_synchronized_create() -> bool {
+    false
+}
diff --git a/src/libstd/sys/windows/c.rs b/src/libstd/sys/windows/c.rs
@@ -935,7 +935,6 @@ extern "system" {
                           args: *const c_void)
                           -> DWORD;
     pub fn TlsAlloc() -> DWORD;
-    pub fn TlsFree(dwTlsIndex: DWORD) -> BOOL;
     pub fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
     pub fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
     pub fn GetLastError() -> DWORD;

diff --git a/src/libstd/sys/windows/thread_local.rs b/src/libstd/sys/windows/thread_local.rs
@@ -8,10 +8,11 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use mem;
 use ptr;
+use sync::atomic::AtomicPtr;
+use sync::atomic::Ordering::SeqCst;
 use sys::c;
-use sys_common::mutex::Mutex;
-use sys_common;
 
 pub type Key = c::DWORD;
 pub type Dtor = unsafe extern fn(*mut u8);
@@ -34,8 +35,6 @@ pub type Dtor = unsafe extern fn(*mut u8);
 // * All TLS destructors are tracked by *us*, not the windows runtime. This
 //   means that we have a global list of destructors for each TLS key that
 //   we know about.
-// * When a TLS key is destroyed, we're sure to remove it from the dtor list
-//   if it's in there.
 // * When a thread exits, we run over the entire list and run dtors for all
 //   non-null keys. This attempts to match Unix semantics in this regard.
 //
@@ -50,13 +49,6 @@ pub type Dtor = unsafe extern fn(*mut u8);
 // [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base
 //                        /threading/thread_local_storage_win.cc#L42
 
-// NB these are specifically not types from `std::sync` as they currently rely
-// on poisoning and this module needs to operate at a lower level than requiring
-// the thread infrastructure to be in place (useful on the borders of
-// initialization/destruction).
-static DTOR_LOCK: Mutex = Mutex::new();
-static mut DTORS: *mut Vec<(Key, Dtor)> = ptr::null_mut();
-
 // -------------------------------------------------------------------------
 // Native bindings
 //
@@ -85,81 +77,64 @@ pub unsafe fn get(key: Key) -> *mut u8 {
 }
 
 #[inline]
-pub unsafe fn destroy(key: Key) {
-    if unregister_dtor(key) {
-        // FIXME: Currently if a key has a destructor associated with it we
-        // can't actually ever unregister it. If we were to
-        // unregister it, then any key destruction would have to be
-        // serialized with respect to actually running destructors.
-        //
-        // We want to avoid a race where right before run_dtors runs
-        // some destructors TlsFree is called. Allowing the call to
-        // TlsFree would imply that the caller understands that *all
-        // known threads* are not exiting, which is quite a difficult
-        // thing to know!
-        //
-        // For now we just leak all keys with dtors to "fix" this.
-        // Note that source [2] above shows precedent for this sort
-        // of strategy.
-    } else {
-        let r = c::TlsFree(key);
-        debug_assert!(r != 0);
-    }
+pub unsafe fn destroy(_key: Key) {
+    rtabort!("can't destroy tls keys on windows")
+}
+
+#[inline]
+pub fn requires_synchronized_create() -> bool {
+    true
 }
 
 // -------------------------------------------------------------------------
 // Dtor registration
 //
-// These functions are associated with registering and unregistering
-// destructors. They're pretty simple, they just push onto a vector and scan
-// a vector currently.
+// Windows has no native support for running destructors so we manage our own
+// list of destructors to keep track of how to destroy keys. We then install a
+// callback later to get invoked whenever a thread exits, running all
+// appropriate destructors.
 //
-// FIXME: This could probably be at least a little faster with a BTree.
-
-unsafe fn init_dtors() {
-    if !DTORS.is_null() { return }
+// Currently unregistration from this list is not supported. A destructor can be
+// registered but cannot be unregistered. There's various simplifying reasons
+// for doing this, the big ones being:
+//
+// 1. Currently we don't even support deallocating TLS keys, so normal operation
+//    doesn't need to deallocate a destructor.
+// 2. There is no point in time where we know we can unregister a destructor
+//    because it could always be getting run by some remote thread.
+//
+// Typically processes have a statically known set of TLS keys which is pretty
+// small, and we'd want to keep this memory alive for the whole process anyway
+// really.
+//
+// Perhaps one day we can fold the `Box` here into a static allocation,
+// expanding the `StaticKey` structure to contain not only a slot for the TLS
+// key but also a slot for the destructor queue on windows. An optimization for
+// another day!
 
-    let dtors = box Vec::<(Key, Dtor)>::new();
+static DTORS: AtomicPtr<Node> = AtomicPtr::new(ptr::null_mut());
 
-    let res = sys_common::at_exit(move|| {
-        DTOR_LOCK.lock();
-        let dtors = DTORS;
-        DTORS = 1 as *mut _;
-        Box::from_raw(dtors);
-        assert!(DTORS as usize == 1); // can't re-init after destructing
-        DTOR_LOCK.unlock();
-    });
-    if res.is_ok() {
-        DTORS = Box::into_raw(dtors);
-    } else {
-        DTORS = 1 as *mut _;
-    }
+struct Node {
+    dtor: Dtor,
+    key: Key,
+    next: *mut Node,
 }
 
 unsafe fn register_dtor(key: Key, dtor: Dtor) {
-    DTOR_LOCK.lock();
-    init_dtors();
-    assert!(DTORS as usize != 0);
-    assert!(DTORS as usize != 1,
-            "cannot create new TLS keys after the main thread has exited");
-    (*DTORS).push((key, dtor));
-    DTOR_LOCK.unlock();
-}
+    let mut node = Box::new(Node {
+        key: key,
+        dtor: dtor,
+        next: ptr::null_mut(),
+    });
 
-unsafe fn unregister_dtor(key: Key) -> bool {
-    DTOR_LOCK.lock();
-    init_dtors();
-    assert!(DTORS as usize != 0);
-    assert!(DTORS as usize != 1,
-            "cannot unregister destructors after the main thread has exited");
-    let ret = {
-        let dtors = &mut *DTORS;
-        let before = dtors.len();
-        dtors.retain(|&(k, _)| k != key);
-        dtors.len() != before
-    };
-    DTOR_LOCK.unlock();
-    ret
+    let mut head = DTORS.load(SeqCst);
+    loop {
+        node.next = head;
+        match DTORS.compare_exchange(head, &mut *node, SeqCst, SeqCst) {
+            Ok(_) => return mem::forget(node),
+            Err(cur) => head = cur,
+        }
+    }
 }
 
 // -------------------------------------------------------------------------
@@ -196,16 +171,12 @@ unsafe fn unregister_dtor(key: Key) -> bool {
 // # Ok, what's up with running all these destructors?
 //
 // This will likely need to be improved over time, but this function
-// attempts a "poor man's" destructor callback system. To do this we clone a
-// local copy of the dtor list to start out with. This is our fudgy attempt
-// to not hold the lock while destructors run and not worry about the list
-// changing while we're looking at it.
-//
-// Once we've got a list of what to run, we iterate over all keys, check
-// their values, and then run destructors if the values turn out to be non
-// null (setting them to null just beforehand). We do this a few times in a
-// loop to basically match Unix semantics. If we don't reach a fixed point
-// after a short while then we just inevitably leak something most likely.
+// attempts a "poor man's" destructor callback system. Once we've got a list
+// of what to run, we iterate over all keys, check their values, and then run
+// destructors if the values turn out to be non null (setting them to null just
+// beforehand). We do this a few times in a loop to basically match Unix
+// semantics. If we don't reach a fixed point after a short while then we just
+// inevitably leak something most likely.
 //
 // # The article mentions weird stuff about "/INCLUDE"?
 //
@@ -259,25 +230,21 @@ unsafe extern "system" fn on_tls_callback(h: c::LPVOID,
 unsafe fn run_dtors() {
     let mut any_run = true;
     for _ in 0..5 {
-        if !any_run { break }
+        if !any_run {
+            break
+        }
         any_run = false;
-        let dtors = {
-            DTOR_LOCK.lock();
-            let ret = if DTORS as usize <= 1 {
-                Vec::new()
-            } else {
-                (*DTORS).iter().map(|s| *s).collect()
-            };
-            DTOR_LOCK.unlock();
-            ret
-        };
-        for &(key, dtor) in &dtors {
-            let ptr = c::TlsGetValue(key);
+        let mut cur = DTORS.load(SeqCst);
+        while !cur.is_null() {
+            let ptr = c::TlsGetValue((*cur).key);
+
             if !ptr.is_null() {
-                c::TlsSetValue(key, ptr::null_mut());
-                dtor(ptr as *mut _);
+                c::TlsSetValue((*cur).key, ptr::null_mut());
+                ((*cur).dtor)(ptr as *mut _);
                 any_run = true;
             }
+
+            cur = (*cur).next;
         }
     }
 }
diff --git a/src/libstd/sys_common/thread_local.rs b/src/libstd/sys_common/thread_local.rs
@@ -61,6 +61,7 @@
 use sync::atomic::{self, AtomicUsize, Ordering};
 
 use sys::thread_local as imp;
+use sys_common::mutex::Mutex;
 
 /// A type for TLS keys that are statically allocated.
 ///
@@ -145,20 +146,6 @@ impl StaticKey {
     #[inline]
     pub unsafe fn set(&self, val: *mut u8) { imp::set(self.key(), val) }
 
-    /// Deallocates this OS TLS key.
-    ///
-    /// This function is unsafe as there is no guarantee that the key is not
-    /// currently in use by other threads or will not ever be used again.
-    ///
-    /// Note that this does *not* run the user-provided destructor if one was
-    /// specified at definition time. Doing so must be done manually.
-    pub unsafe fn destroy(&self) {
-        match self.key.swap(0, Ordering::SeqCst) {
-            0 => {}
-            n => { imp::destroy(n as imp::Key) }
-        }
-    }
-
     #[inline]
     unsafe fn key(&self) -> imp::Key {
         match self.key.load(Ordering::Relaxed) {
@@ -168,6 +155,26 @@ impl StaticKey {
     }
 
     unsafe fn lazy_init(&self) -> usize {
+        // Currently the Windows implementation of TLS is pretty hairy, and
+        // it greatly simplifies creation if we just synchronize everything.
+        //
+        // Additionally a 0-index of a tls key hasn't been seen on windows, so
+        // we just simplify the whole branch.
+        if imp::requires_synchronized_create() {
+            static INIT_LOCK: Mutex = Mutex::new();
+            INIT_LOCK.lock();
+            let mut key = self.key.load(Ordering::SeqCst);
+            if key != 0 {
+                key = imp::create(self.dtor) as usize;
+                if key != 0 {
+                    self.key.store(key, Ordering::SeqCst);
+                }
+            }
+            INIT_LOCK.unlock();
+            assert!(key != 0);
+            return key
+        }
+
         // POSIX allows the key created here to be 0, but the compare_and_swap
         // below relies on using 0 as a sentinel value to check who won the
         // race to set the shared TLS key. As far as I know, there is no