Convert ram_list to RCU
Allow "unlocked" reads of the ram_list by using an RCU-enabled QLIST.

The ramlist mutex is kept.  call_rcu callbacks are run with the iothread
lock taken, but that may change in the future.  Writers still take the
ramlist mutex, but they no longer need to assume that the iothread lock
is taken.
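
To make the writer-side rules concrete, here is a sketch of block
removal under the new scheme; the helper remove_ram_block() and the
reclaim function name are illustrative assumptions, not code from this
commit:

    /* Illustrative sketch only: remove_ram_block() and
     * reclaim_ramblock() are assumed names, not from this commit.
     */
    static void remove_ram_block(RAMBlock *block)
    {
        qemu_mutex_lock_ramlist();       /* writers still serialize here */
        QLIST_REMOVE_RCU(block, next);
        /* Write list before version, pairing with the reader's smp_rmb() */
        smp_wmb();
        ram_list.version++;
        qemu_mutex_unlock_ramlist();
        /* Defer freeing until all current RCU readers are done */
        call_rcu(block, reclaim_ramblock, rcu);
    }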

Readers of the list, instead, no longer require either the iothread
or ramlist mutex, but they need to use rcu_read_lock() and
rcu_read_unlock().
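
For instance, a minimal reader now looks like this (a hypothetical
helper for illustration; only the RCU primitives come from this
commit):

    /* Hypothetical reader: walks the block list with no mutex held. */
    static uint64_t ram_block_count(void)
    {
        RAMBlock *block;
        uint64_t count = 0;

        rcu_read_lock();
        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            count++;
        }
        rcu_read_unlock();
        return count;
    }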

One place in arch_init.c was downgrading from write side to read side
like this:

    qemu_mutex_lock_iothread()
    qemu_mutex_lock_ramlist()
    ...
    qemu_mutex_unlock_iothread()
    ...
    qemu_mutex_unlock_ramlist()

and the equivalent idiom is:

    qemu_mutex_lock_ramlist()
    rcu_read_lock()
    ...
    qemu_mutex_unlock_ramlist()
    ...
    rcu_read_unlock()

Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Mike Day <ncmike@ncultra.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Mike Day authored and bonzini committed Feb 16, 2015
1 parent 0d53d9f commit 0dc3f44
Showing 3 changed files with 121 additions and 52 deletions.
arch_init.c: 65 changes (47 additions, 18 deletions)
@@ -52,6 +52,7 @@
 #include "exec/ram_addr.h"
 #include "hw/acpi/acpi.h"
 #include "qemu/host-utils.h"
+#include "qemu/rcu_queue.h"
 
 #ifdef DEBUG_ARCH_INIT
 #define DPRINTF(fmt, ...) \
@@ -523,9 +524,12 @@ static void migration_bitmap_sync(void)
     trace_migration_bitmap_sync_start();
     address_space_sync_dirty_bitmap(&address_space_memory);
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
     }
+    rcu_read_unlock();
+
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
@@ -648,6 +652,8 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
 /*
  * ram_find_and_save_block: Finds a page to send and sends it to f
  *
+ * Called within an RCU critical section.
+ *
  * Returns:  The number of bytes written.
  *           0 means no dirty pages
  */
@@ -661,7 +667,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
     MemoryRegion *mr;
 
     if (!block)
-        block = QLIST_FIRST(&ram_list.blocks);
+        block = QLIST_FIRST_RCU(&ram_list.blocks);
 
     while (true) {
         mr = block->mr;
@@ -672,9 +678,9 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
         }
         if (offset >= block->used_length) {
             offset = 0;
-            block = QLIST_NEXT(block, next);
+            block = QLIST_NEXT_RCU(block, next);
             if (!block) {
-                block = QLIST_FIRST(&ram_list.blocks);
+                block = QLIST_FIRST_RCU(&ram_list.blocks);
                 complete_round = true;
                 ram_bulk_stage = false;
             }
@@ -728,10 +734,10 @@ uint64_t ram_bytes_total(void)
     RAMBlock *block;
     uint64_t total = 0;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
         total += block->used_length;
-    }
-
+    rcu_read_unlock();
     return total;
 }
 
@@ -777,6 +783,13 @@ static void reset_ram_globals(void)
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
 
+
+/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
+ * long-running RCU critical section.  When rcu-reclaims in the code
+ * start to become numerous it will be necessary to reduce the
+ * granularity of these critical sections.
+ */
+
 static int ram_save_setup(QEMUFile *f, void *opaque)
 {
     RAMBlock *block;
@@ -820,6 +833,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     /* iothread lock needed for ram_list.dirty_memory[] */
     qemu_mutex_lock_iothread();
     qemu_mutex_lock_ramlist();
+    rcu_read_lock();
     bytes_transferred = 0;
     reset_ram_globals();
 
@@ -832,7 +846,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
      * gaps due to alignment or unplugs.
      */
     migration_dirty_pages = 0;
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         uint64_t block_pages;
 
         block_pages = block->used_length >> TARGET_PAGE_BITS;
@@ -841,17 +855,18 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 
     memory_global_dirty_log_start();
     migration_bitmap_sync();
+    qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
 
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         qemu_put_byte(f, strlen(block->idstr));
         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
         qemu_put_be64(f, block->used_length);
     }
 
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 
     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
@@ -868,12 +883,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
     int64_t t0;
     int total_sent = 0;
 
-    qemu_mutex_lock_ramlist();
-
+    rcu_read_lock();
     if (ram_list.version != last_version) {
         reset_ram_globals();
     }
 
+    /* Read version before ram_list.blocks */
+    smp_rmb();
+
     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
 
     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
@@ -904,8 +921,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         i++;
     }
-
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 
     /*
      * Must occur before EOS (or any QEMUFile operation)
@@ -933,7 +949,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 /* Called with iothread lock */
 static int ram_save_complete(QEMUFile *f, void *opaque)
 {
-    qemu_mutex_lock_ramlist();
+    rcu_read_lock();
+
     migration_bitmap_sync();
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -955,7 +972,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
     migration_end();
 
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 
     return 0;
@@ -969,7 +986,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
 
     if (remaining_size < max_size) {
         qemu_mutex_lock_iothread();
+        rcu_read_lock();
         migration_bitmap_sync();
+        rcu_read_unlock();
         qemu_mutex_unlock_iothread();
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
@@ -1011,6 +1030,9 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
     return 0;
 }
 
+/* Must be called from within a rcu critical section.
+ * Returns a pointer from within the RCU-protected ram_list.
+ */
 static inline void *host_from_stream_offset(QEMUFile *f,
                                             ram_addr_t offset,
                                             int flags)
@@ -1032,7 +1054,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
     qemu_get_buffer(f, (uint8_t *)id, len);
     id[len] = 0;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (!strncmp(id, block->idstr, sizeof(id)) &&
             block->max_length > offset) {
             return memory_region_get_ram_ptr(block->mr) + offset;
@@ -1065,6 +1087,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
         ret = -EINVAL;
     }
 
+    /* This RCU critical section can be very long running.
+     * When RCU reclaims in the code start to become numerous,
+     * it will be necessary to reduce the granularity of this
+     * critical section.
+     */
+    rcu_read_lock();
     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
         ram_addr_t addr, total_ram_bytes;
         void *host;
@@ -1089,7 +1117,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
             id[len] = 0;
             length = qemu_get_be64(f);
 
-            QLIST_FOREACH(block, &ram_list.blocks, next) {
+            QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
                 if (!strncmp(id, block->idstr, sizeof(id))) {
                     if (length != block->used_length) {
                         Error *local_err = NULL;
@@ -1163,6 +1191,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
         }
     }
 
+    rcu_read_unlock();
     DPRINTF("Completed load of VM with exit code %d seq iteration "
             "%" PRIu64 "\n", ret, seq_iter);
     return ret;
