From d1a9f2d12fcfc942924956fbe321aedf4226ccb7 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 24 Oct 2016 09:49:25 -0700 Subject: [PATCH 01/37] atomics: Add parameters to macros Making these functional rather than object macros will prevent later problems with complex macro expansion. Reviewed-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/qemu/atomic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index c09fce704f4e..94be1d019652 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -335,11 +335,11 @@ /* Provide shorter names for GCC atomic builtins. */ #define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1) #define atomic_fetch_dec(ptr) __sync_fetch_and_add(ptr, -1) -#define atomic_fetch_add __sync_fetch_and_add -#define atomic_fetch_sub __sync_fetch_and_sub -#define atomic_fetch_and __sync_fetch_and_and -#define atomic_fetch_or __sync_fetch_and_or -#define atomic_cmpxchg __sync_val_compare_and_swap +#define atomic_fetch_add(ptr, n) __sync_fetch_and_add(ptr, n) +#define atomic_fetch_sub(ptr, n) __sync_fetch_and_sub(ptr, n) +#define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n) +#define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n) +#define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new) /* And even shorter names that return void. */ #define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) From 61696ddbdc74263ddb6869856772cfe355a5d3bd Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:01:53 -0400 Subject: [PATCH 02/37] atomics: add atomic_xor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This paves the way for upcoming work. Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-8-git-send-email-cota@braap.org> --- include/qemu/atomic.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index 94be1d019652..3b72d935bb45 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -173,6 +173,7 @@ #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) /* And even shorter names that return void. */ #define atomic_inc(ptr) ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)) @@ -181,6 +182,7 @@ #define atomic_sub(ptr, n) ((void) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST)) #define atomic_and(ptr, n) ((void) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST)) #define atomic_or(ptr, n) ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)) +#define atomic_xor(ptr, n) ((void) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST)) #else /* __ATOMIC_RELAXED */ @@ -339,6 +341,7 @@ #define atomic_fetch_sub(ptr, n) __sync_fetch_and_sub(ptr, n) #define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n) #define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n) +#define atomic_fetch_xor(ptr, n) __sync_fetch_and_xor(ptr, n) #define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new) /* And even shorter names that return void. 
*/ @@ -348,6 +351,7 @@ #define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n)) #define atomic_and(ptr, n) ((void) __sync_fetch_and_and(ptr, n)) #define atomic_or(ptr, n) ((void) __sync_fetch_and_or(ptr, n)) +#define atomic_xor(ptr, n) ((void) __sync_fetch_and_xor(ptr, n)) #endif /* __ATOMIC_RELAXED */ From 83d0c719f837724d9e3963b078211b2242bdd2a5 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:01:54 -0400 Subject: [PATCH 03/37] atomics: add atomic_op_fetch variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This paves the way for upcoming work. Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-9-git-send-email-cota@braap.org> --- include/qemu/atomic.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index 3b72d935bb45..a26a36bb4982 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -175,6 +175,14 @@ #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_inc_fetch(ptr) __atomic_add_fetch(ptr, 1, __ATOMIC_SEQ_CST) +#define atomic_dec_fetch(ptr) __atomic_sub_fetch(ptr, 1, __ATOMIC_SEQ_CST) +#define atomic_add_fetch(ptr, n) __atomic_add_fetch(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_sub_fetch(ptr, n) __atomic_sub_fetch(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_and_fetch(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_or_fetch(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST) +#define atomic_xor_fetch(ptr, n) __atomic_xor_fetch(ptr, n, __ATOMIC_SEQ_CST) + /* And even shorter names that return void. */ #define atomic_inc(ptr) ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)) #define atomic_dec(ptr) ((void) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST)) @@ -342,6 +350,15 @@ #define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n) #define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n) #define atomic_fetch_xor(ptr, n) __sync_fetch_and_xor(ptr, n) + +#define atomic_inc_fetch(ptr) __sync_add_and_fetch(ptr, 1) +#define atomic_dec_fetch(ptr) __sync_add_and_fetch(ptr, -1) +#define atomic_add_fetch(ptr, n) __sync_add_and_fetch(ptr, n) +#define atomic_sub_fetch(ptr, n) __sync_sub_and_fetch(ptr, n) +#define atomic_and_fetch(ptr, n) __sync_and_and_fetch(ptr, n) +#define atomic_or_fetch(ptr, n) __sync_or_and_fetch(ptr, n) +#define atomic_xor_fetch(ptr, n) __sync_xor_and_fetch(ptr, n) + #define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new) /* And even shorter names that return void. */ From 84bca3927b36fb1d9a2ca85cbbdf9023d2b84678 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 24 Oct 2016 10:29:32 -0700 Subject: [PATCH 04/37] atomics: Add __nocheck atomic operations While the check against sizeof(void *) is appropriate for normal usage within qemu, there are places in which we want wider operations and have checked for their existence. Reviewed-by: Emilio G. 
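As an illustration of the intended use (a hypothetical sketch, not part of this series: the CONFIG_CMPXCHG128 guard and the cmpxchg128() wrapper are invented for the example), a caller that has already probed for 16-byte compare-and-swap support can reach for the __nocheck form, which the checked atomic_cmpxchg() would reject at build time because sizeof(__uint128_t) > sizeof(void *):

#ifdef CONFIG_CMPXCHG128   /* hypothetical feature-test symbol */
static inline __uint128_t cmpxchg128(__uint128_t *ptr,
                                     __uint128_t old, __uint128_t new)
{
    /* atomic_cmpxchg() would trip its QEMU_BUILD_BUG_ON(sizeof(*ptr) >
       sizeof(void *)); the __nocheck variant trusts the caller's probe. */
    return atomic_cmpxchg__nocheck(ptr, old, new);
}
#endif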
Cota Signed-off-by: Richard Henderson --- include/qemu/atomic.h | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index a26a36bb4982..878fa0700d6e 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -99,15 +99,21 @@ * no effect on the generated code but not using the atomic primitives * will get flagged by sanitizers as a violation. */ +#define atomic_read__nocheck(ptr) \ + __atomic_load_n(ptr, __ATOMIC_RELAXED) + #define atomic_read(ptr) \ ({ \ QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ - __atomic_load_n(ptr, __ATOMIC_RELAXED); \ + atomic_read__nocheck(ptr); \ }) +#define atomic_set__nocheck(ptr, i) \ + __atomic_store_n(ptr, i, __ATOMIC_RELAXED) + #define atomic_set(ptr, i) do { \ QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ - __atomic_store_n(ptr, i, __ATOMIC_RELAXED); \ + atomic_set__nocheck(ptr, i); \ } while(0) /* See above: most compilers currently treat consume and acquire the @@ -151,20 +157,27 @@ /* All the remaining operations are fully sequentially consistent */ +#define atomic_xchg__nocheck(ptr, i) ({ \ + __atomic_exchange_n(ptr, (i), __ATOMIC_SEQ_CST); \ +}) + #define atomic_xchg(ptr, i) ({ \ QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ - __atomic_exchange_n(ptr, i, __ATOMIC_SEQ_CST); \ + atomic_xchg__nocheck(ptr, i); \ }) /* Returns the eventual value, failed or not */ -#define atomic_cmpxchg(ptr, old, new) \ - ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ +#define atomic_cmpxchg__nocheck(ptr, old, new) ({ \ typeof_strip_qual(*ptr) _old = (old); \ __atomic_compare_exchange_n(ptr, &_old, new, false, \ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ _old; \ - }) +}) + +#define atomic_cmpxchg(ptr, old, new) ({ \ + QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ + atomic_cmpxchg__nocheck(ptr, old, new); \ +}) /* Provide shorter names for GCC atomic builtins, return old value */ #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) @@ -279,8 +292,11 @@ /* These will only be atomic if the processor does the fetch or store * in a single issue memory operation */ -#define atomic_read(ptr) (*(__typeof__(*ptr) volatile*) (ptr)) -#define atomic_set(ptr, i) ((*(__typeof__(*ptr) volatile*) (ptr)) = (i)) +#define atomic_read__nocheck(p) (*(__typeof__(*(p)) volatile*) (p)) +#define atomic_set__nocheck(p, i) ((*(__typeof__(*(p)) volatile*) (p)) = (i)) + +#define atomic_read(ptr) atomic_read__nocheck(ptr) +#define atomic_set(ptr, i) atomic_set__nocheck(ptr,i) /** * atomic_rcu_read - reads a RCU-protected pointer to a local variable @@ -341,6 +357,7 @@ #define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i)) #endif #endif +#define atomic_xchg__nocheck atomic_xchg /* Provide shorter names for GCC atomic builtins. */ #define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1) @@ -360,6 +377,7 @@ #define atomic_xor_fetch(ptr, n) __sync_xor_and_fetch(ptr, n) #define atomic_cmpxchg(ptr, old, new) __sync_val_compare_and_swap(ptr, old, new) +#define atomic_cmpxchg__nocheck(ptr, old, new) atomic_cmpxchg(ptr, old, new) /* And even shorter names that return void. 
*/ #define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) From 258dfaaad05a5fbe32a142b794e1df3e16501d0e Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 29 Jun 2016 15:48:03 -0700 Subject: [PATCH 05/37] exec: Avoid direct references to Int128 parts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Emilio G. Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- exec.c | 4 ++-- include/qemu/int128.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/exec.c b/exec.c index 587b489eefd5..4c84389b565a 100644 --- a/exec.c +++ b/exec.c @@ -352,9 +352,9 @@ static inline bool section_covers_addr(const MemoryRegionSection *section, /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means * the section must cover the entire address space. */ - return section->size.hi || + return int128_gethi(section->size) || range_covers_byte(section->offset_within_address_space, - section->size.lo, addr); + int128_getlo(section->size), addr); } static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr, diff --git a/include/qemu/int128.h b/include/qemu/int128.h index c5988813df4c..52aaf998113f 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -20,6 +20,16 @@ static inline uint64_t int128_get64(Int128 a) return a.lo; } +static inline uint64_t int128_getlo(Int128 a) +{ + return a.lo; +} + +static inline int64_t int128_gethi(Int128 a) +{ + return a.hi; +} + static inline Int128 int128_zero(void) { return int128_make64(0); From 0846beb36641e8f0c3ee55a5bb84d468b653c852 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 29 Jun 2016 15:52:10 -0700 Subject: [PATCH 06/37] int128: Use __int128 if available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Emilio G. 
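As a minimal usage sketch (illustrative only, not part of the patch), the accessor API keeps callers independent of the representation: the same code compiles to plain __int128_t arithmetic when CONFIG_INT128 is set and to the two-limb struct arithmetic otherwise.

/* Sum two 64-bit values into a 128-bit result and split it again. */
static inline void add_u64_wide(uint64_t a, uint64_t b,
                                uint64_t *lo, uint64_t *carry)
{
    Int128 s = int128_add(int128_make64(a), int128_make64(b));

    *lo = int128_getlo(s);               /* low 64 bits of the sum */
    *carry = (uint64_t)int128_gethi(s);  /* 0 or 1 */
}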
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- include/qemu/int128.h | 135 +++++++++++++++++++++++++++++++++++++++++- tests/test-int128.c | 22 +++---- 2 files changed, 145 insertions(+), 12 deletions(-) diff --git a/include/qemu/int128.h b/include/qemu/int128.h index 52aaf998113f..64a7aca60442 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -1,6 +1,138 @@ #ifndef INT128_H #define INT128_H +#ifdef CONFIG_INT128 + +typedef __int128_t Int128; + +static inline Int128 int128_make64(uint64_t a) +{ + return a; +} + +static inline uint64_t int128_get64(Int128 a) +{ + uint64_t r = a; + assert(r == a); + return r; +} + +static inline uint64_t int128_getlo(Int128 a) +{ + return a; +} + +static inline int64_t int128_gethi(Int128 a) +{ + return a >> 64; +} + +static inline Int128 int128_zero(void) +{ + return 0; +} + +static inline Int128 int128_one(void) +{ + return 1; +} + +static inline Int128 int128_2_64(void) +{ + return (Int128)1 << 64; +} + +static inline Int128 int128_exts64(int64_t a) +{ + return a; +} + +static inline Int128 int128_and(Int128 a, Int128 b) +{ + return a & b; +} + +static inline Int128 int128_rshift(Int128 a, int n) +{ + return a >> n; +} + +static inline Int128 int128_add(Int128 a, Int128 b) +{ + return a + b; +} + +static inline Int128 int128_neg(Int128 a) +{ + return -a; +} + +static inline Int128 int128_sub(Int128 a, Int128 b) +{ + return a - b; +} + +static inline bool int128_nonneg(Int128 a) +{ + return a >= 0; +} + +static inline bool int128_eq(Int128 a, Int128 b) +{ + return a == b; +} + +static inline bool int128_ne(Int128 a, Int128 b) +{ + return a != b; +} + +static inline bool int128_ge(Int128 a, Int128 b) +{ + return a >= b; +} + +static inline bool int128_lt(Int128 a, Int128 b) +{ + return a < b; +} + +static inline bool int128_le(Int128 a, Int128 b) +{ + return a <= b; +} + +static inline bool int128_gt(Int128 a, Int128 b) +{ + return a > b; +} + +static inline bool int128_nz(Int128 a) +{ + return a != 0; +} + +static inline Int128 int128_min(Int128 a, Int128 b) +{ + return a < b ? a : b; +} + +static inline Int128 int128_max(Int128 a, Int128 b) +{ + return a > b ? 
a : b; +} + +static inline void int128_addto(Int128 *a, Int128 b) +{ + *a += b; +} + +static inline void int128_subfrom(Int128 *a, Int128 b) +{ + *a -= b; +} + +#else /* !CONFIG_INT128 */ typedef struct Int128 Int128; @@ -153,4 +285,5 @@ static inline void int128_subfrom(Int128 *a, Int128 b) *a = int128_sub(*a, b); } -#endif +#endif /* CONFIG_INT128 */ +#endif /* INT128_H */ diff --git a/tests/test-int128.c b/tests/test-int128.c index 4390123bd3cb..b86a3c76e615 100644 --- a/tests/test-int128.c +++ b/tests/test-int128.c @@ -41,7 +41,7 @@ static Int128 expand(uint32_t x) uint64_t l, h; l = expand16(x & 65535); h = expand16(x >> 16); - return (Int128) {l, h}; + return (Int128) int128_make128(l, h); }; static void test_and(void) @@ -54,8 +54,8 @@ static void test_and(void) Int128 b = expand(tests[j]); Int128 r = expand(tests[i] & tests[j]); Int128 s = int128_and(a, b); - g_assert_cmpuint(r.lo, ==, s.lo); - g_assert_cmpuint(r.hi, ==, s.hi); + g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s)); + g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s)); } } } @@ -70,8 +70,8 @@ static void test_add(void) Int128 b = expand(tests[j]); Int128 r = expand(tests[i] + tests[j]); Int128 s = int128_add(a, b); - g_assert_cmpuint(r.lo, ==, s.lo); - g_assert_cmpuint(r.hi, ==, s.hi); + g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s)); + g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s)); } } } @@ -86,8 +86,8 @@ static void test_sub(void) Int128 b = expand(tests[j]); Int128 r = expand(tests[i] - tests[j]); Int128 s = int128_sub(a, b); - g_assert_cmpuint(r.lo, ==, s.lo); - g_assert_cmpuint(r.hi, ==, s.hi); + g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s)); + g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s)); } } } @@ -100,8 +100,8 @@ static void test_neg(void) Int128 a = expand(tests[i]); Int128 r = expand(-tests[i]); Int128 s = int128_neg(a); - g_assert_cmpuint(r.lo, ==, s.lo); - g_assert_cmpuint(r.hi, ==, s.hi); + g_assert_cmpuint(int128_getlo(r), ==, int128_getlo(s)); + g_assert_cmpuint(int128_gethi(r), ==, int128_gethi(s)); } } @@ -180,8 +180,8 @@ test_rshift_one(uint32_t x, int n, uint64_t h, uint64_t l) { Int128 a = expand(x); Int128 r = int128_rshift(a, n); - g_assert_cmpuint(r.lo, ==, l); - g_assert_cmpuint(r.hi, ==, h); + g_assert_cmpuint(int128_getlo(r), ==, l); + g_assert_cmpuint(int128_gethi(r), ==, h); } static void test_rshift(void) From 1edaeee0955fba7d834b7c8f4e372e7eae030745 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 29 Jun 2016 16:57:26 -0700 Subject: [PATCH 07/37] int128: Add int128_make128 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows Int128 to be used more generally, rather than having to begin with 64-bit inputs and accumulate. Reviewed-by: Emilio G. 
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- include/qemu/int128.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/include/qemu/int128.h b/include/qemu/int128.h index 64a7aca60442..d4c6e44331f8 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -10,6 +10,11 @@ static inline Int128 int128_make64(uint64_t a) return a; } +static inline Int128 int128_make128(uint64_t lo, uint64_t hi) +{ + return (__uint128_t)hi << 64 | lo; +} + static inline uint64_t int128_get64(Int128 a) { uint64_t r = a; @@ -146,6 +151,11 @@ static inline Int128 int128_make64(uint64_t a) return (Int128) { a, 0 }; } +static inline Int128 int128_make128(uint64_t lo, uint64_t hi) +{ + return (Int128) { lo, hi }; +} + static inline uint64_t int128_get64(Int128 a) { assert(!a.hi); @@ -195,9 +205,9 @@ static inline Int128 int128_rshift(Int128 a, int n) } h = a.hi >> (n & 63); if (n >= 64) { - return (Int128) { h, h >> 63 }; + return int128_make128(h, h >> 63); } else { - return (Int128) { (a.lo >> n) | ((uint64_t)a.hi << (64 - n)), h }; + return int128_make128((a.lo >> n) | ((uint64_t)a.hi << (64 - n)), h); } } @@ -211,18 +221,18 @@ static inline Int128 int128_add(Int128 a, Int128 b) * * So the carry is lo < a.lo. */ - return (Int128) { lo, (uint64_t)a.hi + b.hi + (lo < a.lo) }; + return int128_make128(lo, (uint64_t)a.hi + b.hi + (lo < a.lo)); } static inline Int128 int128_neg(Int128 a) { uint64_t lo = -a.lo; - return (Int128) { lo, ~(uint64_t)a.hi + !lo }; + return int128_make128(lo, ~(uint64_t)a.hi + !lo); } static inline Int128 int128_sub(Int128 a, Int128 b) { - return (Int128){ a.lo - b.lo, (uint64_t)a.hi - b.hi - (a.lo < b.lo) }; + return int128_make128(a.lo - b.lo, (uint64_t)a.hi - b.hi - (a.lo < b.lo)); } static inline bool int128_nonneg(Int128 a) From fdbc2b5722f6092e47181a947c90fd4bdcc1c121 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 29 Jun 2016 22:12:55 -0700 Subject: [PATCH 08/37] tcg: Add EXCP_ATOMIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we cannot emulate an atomic operation within a parallel context, this exception allows us to stop the world and try again in a serial context. Reviewed-by: Emilio G. 
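A sketch of the expected use from a target helper (hypothetical: helper_store_pair_example and its arguments are invented for illustration; only parallel_cpus, cpu_loop_exit_atomic() and cpu_exec_step_atomic() come from this patch). When the host cannot perform the guest's atomic operation while other vCPUs are running, the helper raises EXCP_ATOMIC; the cpu loop responds by calling cpu_exec_step_atomic(), which stops the other vCPUs, clears parallel_cpus, and re-executes the one instruction serially.

static void helper_store_pair_example(CPUArchState *env, target_ulong addr,
                                      uint64_t lo, uint64_t hi)
{
    uintptr_t ra = GETPC();

    if (parallel_cpus) {
        /* Cannot be made atomic here: stop the world and retry. */
        cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
    }
    /* Serial context: plain stores can no longer race with other vCPUs. */
    cpu_stq_data_ra(env, addr, lo, ra);
    cpu_stq_data_ra(env, addr + 8, hi, ra);
}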
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- cpu-exec-common.c | 6 ++++++ cpu-exec.c | 30 +++++++++++++++++++++++++++ cpus.c | 2 ++ include/exec/cpu-all.h | 1 + include/exec/exec-all.h | 1 + include/qemu-common.h | 1 + linux-user/main.c | 45 +++++++++++++++++++++++++++++++++++++++++ tcg/tcg.h | 1 + translate-all.c | 1 + 9 files changed, 88 insertions(+) diff --git a/cpu-exec-common.c b/cpu-exec-common.c index 0cb4ae60eff9..767d9c6f0c41 100644 --- a/cpu-exec-common.c +++ b/cpu-exec-common.c @@ -77,3 +77,9 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc) } siglongjmp(cpu->jmp_env, 1); } + +void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc) +{ + cpu->exception_index = EXCP_ATOMIC; + cpu_loop_exit_restore(cpu, pc); +} diff --git a/cpu-exec.c b/cpu-exec.c index e114fcdf2962..c9997935ddd2 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -222,6 +222,36 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles, } #endif +static void cpu_exec_step(CPUState *cpu) +{ + CPUArchState *env = (CPUArchState *)cpu->env_ptr; + TranslationBlock *tb; + target_ulong cs_base, pc; + uint32_t flags; + + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); + tb = tb_gen_code(cpu, pc, cs_base, flags, + 1 | CF_NOCACHE | CF_IGNORE_ICOUNT); + tb->orig_tb = NULL; + /* execute the generated code */ + trace_exec_tb_nocache(tb, pc); + cpu_tb_exec(cpu, tb); + tb_phys_invalidate(tb, -1); + tb_free(tb); +} + +void cpu_exec_step_atomic(CPUState *cpu) +{ + start_exclusive(); + + /* Since we got here, we know that parallel_cpus must be true. */ + parallel_cpus = false; + cpu_exec_step(cpu); + parallel_cpus = true; + + end_exclusive(); +} + struct tb_desc { target_ulong pc; target_ulong cs_base; diff --git a/cpus.c b/cpus.c index 31204bb4b3dd..cfd5cdc0e34d 100644 --- a/cpus.c +++ b/cpus.c @@ -1497,6 +1497,8 @@ static void tcg_exec_all(void) if (r == EXCP_DEBUG) { cpu_handle_guest_debug(cpu); break; + } else if (r == EXCP_ATOMIC) { + cpu_exec_step_atomic(cpu); } } else if (cpu->stop || cpu->stopped) { if (cpu->unplug) { diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index 861260d3dbc6..e9004e5798f4 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -31,6 +31,7 @@ #define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ #define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ #define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ +#define EXCP_ATOMIC 0x10005 /* stop-the-world and emulate atomic */ /* some important defines: * diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 9797d556e838..cb624e4acc09 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -59,6 +59,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, void QEMU_NORETURN cpu_loop_exit(CPUState *cpu); void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc); +void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc); #if !defined(CONFIG_USER_ONLY) void cpu_reloading_memory_map(void); diff --git a/include/qemu-common.h b/include/qemu-common.h index 7e6e4feb4b2b..1430390eb6db 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -80,6 +80,7 @@ void tcg_exec_init(unsigned long tb_size); bool tcg_enabled(void); void cpu_exec_init_all(void); +void cpu_exec_step_atomic(CPUState *cpu); /** * set_preferred_target_page_bits: diff --git a/linux-user/main.c b/linux-user/main.c index 54970bc4d906..f0c47d49ab85 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -354,6 +354,9 @@ void 
cpu_loop(CPUX86State *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: pc = env->segs[R_CS].base + env->eip; EXCP_DUMP(env, "qemu: 0x%08lx: unhandled CPU exception 0x%x - aborting\n", @@ -851,6 +854,9 @@ void cpu_loop(CPUARMState *env) case EXCP_YIELD: /* nothing to do here for user-mode, just resume guest code */ break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: error: EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr); @@ -1051,6 +1057,9 @@ void cpu_loop(CPUARMState *env) case EXCP_YIELD: /* nothing to do here for user-mode, just resume guest code */ break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr); abort(); @@ -1142,6 +1151,9 @@ void cpu_loop(CPUUniCore32State *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: goto error; } @@ -1415,6 +1427,9 @@ void cpu_loop (CPUSPARCState *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: printf ("Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -1954,6 +1969,9 @@ void cpu_loop(CPUPPCState *env) case EXCP_INTERRUPT: /* just indicate that signals should be handled asap */ break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: cpu_abort(cs, "Unknown exception 0x%x. Aborting\n", trapnr); break; @@ -2649,6 +2667,9 @@ void cpu_loop(CPUMIPSState *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: error: EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr); @@ -2736,6 +2757,9 @@ void cpu_loop(CPUOpenRISCState *env) case EXCP_NR: qemu_log_mask(CPU_LOG_INT, "\nNR\n"); break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: EXCP_DUMP(env, "\nqemu: unhandled CPU exception %#x - aborting\n", trapnr); @@ -2812,6 +2836,9 @@ void cpu_loop(CPUSH4State *env) queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info); break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: printf ("Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -2879,6 +2906,9 @@ void cpu_loop(CPUCRISState *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: printf ("Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -2995,6 +3025,9 @@ void cpu_loop(CPUMBState *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: printf ("Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -3098,6 +3131,9 @@ void cpu_loop(CPUM68KState *env) } } break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", trapnr); abort(); @@ -3334,6 +3370,9 @@ void cpu_loop(CPUAlphaState *env) case EXCP_INTERRUPT: /* Just indicate that signals should be handled asap. 
*/ break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: printf ("Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -3463,6 +3502,9 @@ void cpu_loop(CPUS390XState *env) queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info); break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: fprintf(stderr, "Unhandled trap: 0x%x\n", trapnr); cpu_dump_state(cs, stderr, fprintf, 0); @@ -3717,6 +3759,9 @@ void cpu_loop(CPUTLGState *env) case TILEGX_EXCP_REG_UDN_ACCESS: gen_sigill_reg(env); break; + case EXCP_ATOMIC: + cpu_exec_step_atomic(cs); + break; default: fprintf(stderr, "trapnr is %d[0x%x].\n", trapnr, trapnr); g_assert_not_reached(); diff --git a/tcg/tcg.h b/tcg/tcg.h index c9949aa79b64..3b211560c879 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -704,6 +704,7 @@ struct TCGContext { }; extern TCGContext tcg_ctx; +extern bool parallel_cpus; static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v) { diff --git a/translate-all.c b/translate-all.c index 86b45a19c51b..76fc18c98f58 100644 --- a/translate-all.c +++ b/translate-all.c @@ -118,6 +118,7 @@ static void *l1_map[V_L1_MAX_SIZE]; /* code generation context */ TCGContext tcg_ctx; +bool parallel_cpus; /* translation block context */ #ifdef CONFIG_USER_ONLY From b67cb68ba59fd36076e5961139cb3c953c69bed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Wed, 5 Oct 2016 11:13:04 -0700 Subject: [PATCH 09/37] linux-user: enable parallel code generation on clone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable parallel_cpus controls the generation of thread aware atomic code. We only need to set it once we clone our first thread. At this point any existing translations need to be thrown away. Reviewed-by: Emilio G. Cota Signed-off-by: Alex Bennée Signed-off-by: Richard Henderson --- linux-user/syscall.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/linux-user/syscall.c b/linux-user/syscall.c index db697c0bf38a..7b77503f94f4 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -6164,6 +6164,14 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp, sigfillset(&sigmask); sigprocmask(SIG_BLOCK, &sigmask, &info.sigmask); + /* If this is our first additional thread, we need to ensure we + * generate code for parallel execution and flush old translations. + */ + if (!parallel_cpus) { + parallel_cpus = true; + tb_flush(cpu); + } + ret = pthread_create(&info.thread, &attr, clone_func, &info); /* TODO: Free new CPU state if thread creation failed. */ From dea2198201b3e0151d75b42774c51cf2ffe2ca4b Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 8 Jul 2016 18:14:28 -0700 Subject: [PATCH 10/37] cputlb: Replace SHIFT with DATA_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Emilio G. 
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- cputlb.c | 16 ++++++++-------- softmmu_template.h | 7 ++----- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/cputlb.c b/cputlb.c index 3c99c34ac8bb..5575b733f5bd 100644 --- a/cputlb.c +++ b/cputlb.c @@ -529,16 +529,16 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, #define MMUSUFFIX _mmu -#define SHIFT 0 +#define DATA_SIZE 1 #include "softmmu_template.h" -#define SHIFT 1 +#define DATA_SIZE 2 #include "softmmu_template.h" -#define SHIFT 2 +#define DATA_SIZE 4 #include "softmmu_template.h" -#define SHIFT 3 +#define DATA_SIZE 8 #include "softmmu_template.h" #undef MMUSUFFIX @@ -547,14 +547,14 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, #define GETPC() ((uintptr_t)0) #define SOFTMMU_CODE_ACCESS -#define SHIFT 0 +#define DATA_SIZE 1 #include "softmmu_template.h" -#define SHIFT 1 +#define DATA_SIZE 2 #include "softmmu_template.h" -#define SHIFT 2 +#define DATA_SIZE 4 #include "softmmu_template.h" -#define SHIFT 3 +#define DATA_SIZE 8 #include "softmmu_template.h" diff --git a/softmmu_template.h b/softmmu_template.h index 27ed2694df24..f9c51fec1ccc 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -25,8 +25,6 @@ #include "exec/address-spaces.h" #include "exec/memory.h" -#define DATA_SIZE (1 << SHIFT) - #if DATA_SIZE == 8 #define SUFFIX q #define LSUFFIX q @@ -134,7 +132,7 @@ static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env, } cpu->mem_io_vaddr = addr; - memory_region_dispatch_read(mr, physaddr, &val, 1 << SHIFT, + memory_region_dispatch_read(mr, physaddr, &val, DATA_SIZE, iotlbentry->attrs); return val; } @@ -311,7 +309,7 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env, cpu->mem_io_vaddr = addr; cpu->mem_io_pc = retaddr; - memory_region_dispatch_write(mr, physaddr, val, 1 << SHIFT, + memory_region_dispatch_write(mr, physaddr, val, DATA_SIZE, iotlbentry->attrs); } @@ -492,7 +490,6 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, #endif /* !defined(SOFTMMU_CODE_ACCESS) */ #undef READ_ACCESS_TYPE -#undef SHIFT #undef DATA_TYPE #undef SUFFIX #undef LSUFFIX From 3b08f0a92545ba06fbdeaae929a5172480300c33 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 8 Jul 2016 18:22:26 -0700 Subject: [PATCH 11/37] cputlb: Move probe_write out of softmmu_template.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Emilio G. Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- cputlb.c | 21 +++++++++++++++++++++ softmmu_template.h | 23 ----------------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/cputlb.c b/cputlb.c index 5575b733f5bd..0c9b77b0a02d 100644 --- a/cputlb.c +++ b/cputlb.c @@ -527,6 +527,27 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY), \ (ADDR) & TARGET_PAGE_MASK) +/* Probe for whether the specified guest write access is permitted. + * If it is not permitted then an exception will be taken in the same + * way as if this were a real write access (and we will not return). + * Otherwise the function will return, and there will be a valid + * entry in the TLB for this access. 
+ */ +void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, + uintptr_t retaddr) +{ + int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write; + + if ((addr & TARGET_PAGE_MASK) + != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + /* TLB entry is for a different page */ + if (!VICTIM_TLB_HIT(addr_write, addr)) { + tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr); + } + } +} + #define MMUSUFFIX _mmu #define DATA_SIZE 1 diff --git a/softmmu_template.h b/softmmu_template.h index f9c51fec1ccc..538cff5719cb 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -464,29 +464,6 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val, glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val); } #endif /* DATA_SIZE > 1 */ - -#if DATA_SIZE == 1 -/* Probe for whether the specified guest write access is permitted. - * If it is not permitted then an exception will be taken in the same - * way as if this were a real write access (and we will not return). - * Otherwise the function will return, and there will be a valid - * entry in the TLB for this access. - */ -void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, - uintptr_t retaddr) -{ - int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); - target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write; - - if ((addr & TARGET_PAGE_MASK) - != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { - /* TLB entry is for a different page */ - if (!VICTIM_TLB_HIT(addr_write, addr)) { - tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr); - } - } -} -#endif #endif /* !defined(SOFTMMU_CODE_ACCESS) */ #undef READ_ACCESS_TYPE From 40978428853e2f7b4597ab2a9ffeb187333802dc Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 8 Jul 2016 18:24:55 -0700 Subject: [PATCH 12/37] cputlb: Remove includes from softmmu_template.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already include exec/address-spaces.h and exec/memory.h in cputlb.c; the include of qemu/timer.h appears to be a fossil. Reviewed-by: Emilio G. Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- softmmu_template.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/softmmu_template.h b/softmmu_template.h index 538cff5719cb..b9532a4b2428 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -21,10 +21,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, see . */ -#include "qemu/timer.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" - #if DATA_SIZE == 8 #define SUFFIX q #define LSUFFIX q From 82a45b96a203a7403427183f1afd3d295222ff7d Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 8 Jul 2016 18:51:28 -0700 Subject: [PATCH 13/37] cputlb: Move most of iotlb code out of line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saves 2k code size off of a cold path. Reviewed-by: Emilio G. 
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- cputlb.c | 37 +++++++++++++++++++++++++++++++++ softmmu_template.h | 52 +++++++++------------------------------------- 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/cputlb.c b/cputlb.c index 0c9b77b0a02d..1bee47d5cdce 100644 --- a/cputlb.c +++ b/cputlb.c @@ -498,6 +498,43 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr) return qemu_ram_addr_from_host_nofail(p); } +static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry, + target_ulong addr, uintptr_t retaddr, int size) +{ + CPUState *cpu = ENV_GET_CPU(env); + hwaddr physaddr = iotlbentry->addr; + MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); + uint64_t val; + + physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + cpu->mem_io_pc = retaddr; + if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { + cpu_io_recompile(cpu, retaddr); + } + + cpu->mem_io_vaddr = addr; + memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs); + return val; +} + +static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry, + uint64_t val, target_ulong addr, + uintptr_t retaddr, int size) +{ + CPUState *cpu = ENV_GET_CPU(env); + hwaddr physaddr = iotlbentry->addr; + MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); + + physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { + cpu_io_recompile(cpu, retaddr); + } + + cpu->mem_io_vaddr = addr; + cpu->mem_io_pc = retaddr; + memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs); +} + /* Return true if ADDR is present in the victim tlb, and has been copied back to the main tlb. */ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, diff --git a/softmmu_template.h b/softmmu_template.h index b9532a4b2428..035ffc8be6bf 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -112,25 +112,12 @@ #ifndef SOFTMMU_CODE_ACCESS static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env, - CPUIOTLBEntry *iotlbentry, + size_t mmu_idx, size_t index, target_ulong addr, uintptr_t retaddr) { - uint64_t val; - CPUState *cpu = ENV_GET_CPU(env); - hwaddr physaddr = iotlbentry->addr; - MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); - - physaddr = (physaddr & TARGET_PAGE_MASK) + addr; - cpu->mem_io_pc = retaddr; - if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { - cpu_io_recompile(cpu, retaddr); - } - - cpu->mem_io_vaddr = addr; - memory_region_dispatch_read(mr, physaddr, &val, DATA_SIZE, - iotlbentry->attrs); - return val; + CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index]; + return io_readx(env, iotlbentry, addr, retaddr, DATA_SIZE); } #endif @@ -161,15 +148,13 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, /* Handle an IO access. */ if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) { - CPUIOTLBEntry *iotlbentry; if ((addr & (DATA_SIZE - 1)) != 0) { goto do_unaligned_access; } - iotlbentry = &env->iotlb[mmu_idx][index]; /* ??? Note that the io helpers always read data in the target byte ordering. We should push the LE/BE request down into io. */ - res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr); + res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr); res = TGT_LE(res); return res; } @@ -230,15 +215,13 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, /* Handle an IO access. 
*/ if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) { - CPUIOTLBEntry *iotlbentry; if ((addr & (DATA_SIZE - 1)) != 0) { goto do_unaligned_access; } - iotlbentry = &env->iotlb[mmu_idx][index]; /* ??? Note that the io helpers always read data in the target byte ordering. We should push the LE/BE request down into io. */ - res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr); + res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr); res = TGT_BE(res); return res; } @@ -289,24 +272,13 @@ WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr, #endif static inline void glue(io_write, SUFFIX)(CPUArchState *env, - CPUIOTLBEntry *iotlbentry, + size_t mmu_idx, size_t index, DATA_TYPE val, target_ulong addr, uintptr_t retaddr) { - CPUState *cpu = ENV_GET_CPU(env); - hwaddr physaddr = iotlbentry->addr; - MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); - - physaddr = (physaddr & TARGET_PAGE_MASK) + addr; - if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { - cpu_io_recompile(cpu, retaddr); - } - - cpu->mem_io_vaddr = addr; - cpu->mem_io_pc = retaddr; - memory_region_dispatch_write(mr, physaddr, val, DATA_SIZE, - iotlbentry->attrs); + CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index]; + return io_writex(env, iotlbentry, val, addr, retaddr, DATA_SIZE); } void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val, @@ -334,16 +306,14 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val, /* Handle an IO access. */ if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) { - CPUIOTLBEntry *iotlbentry; if ((addr & (DATA_SIZE - 1)) != 0) { goto do_unaligned_access; } - iotlbentry = &env->iotlb[mmu_idx][index]; /* ??? Note that the io helpers always read data in the target byte ordering. We should push the LE/BE request down into io. */ val = TGT_LE(val); - glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr); + glue(io_write, SUFFIX)(env, mmu_idx, index, val, addr, retaddr); return; } @@ -412,16 +382,14 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val, /* Handle an IO access. */ if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) { - CPUIOTLBEntry *iotlbentry; if ((addr & (DATA_SIZE - 1)) != 0) { goto do_unaligned_access; } - iotlbentry = &env->iotlb[mmu_idx][index]; /* ??? Note that the io helpers always read data in the target byte ordering. We should push the LE/BE request down into io. */ val = TGT_BE(val); - glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr); + glue(io_write, SUFFIX)(env, mmu_idx, index, val, addr, retaddr); return; } From c86c6e4c80fee4d9423bedb10ba9e9c4aa68f861 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 8 Jul 2016 19:02:33 -0700 Subject: [PATCH 14/37] cputlb: Tidy some macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TGT_LE and TGT_BE are not size dependent and do not need to be redefined. The others are no longer used at all. Reviewed-by: Emilio G. 
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- cputlb.c | 8 ++++++++ softmmu_template.h | 22 ---------------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/cputlb.c b/cputlb.c index 1bee47d5cdce..82cf46ea6492 100644 --- a/cputlb.c +++ b/cputlb.c @@ -585,6 +585,14 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, } } +#ifdef TARGET_WORDS_BIGENDIAN +# define TGT_BE(X) (X) +# define TGT_LE(X) BSWAP(X) +#else +# define TGT_BE(X) BSWAP(X) +# define TGT_LE(X) (X) +#endif + #define MMUSUFFIX _mmu #define DATA_SIZE 1 diff --git a/softmmu_template.h b/softmmu_template.h index 035ffc8be6bf..4a2b6653f64e 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -78,14 +78,6 @@ # define BSWAP(X) (X) #endif -#ifdef TARGET_WORDS_BIGENDIAN -# define TGT_BE(X) (X) -# define TGT_LE(X) BSWAP(X) -#else -# define TGT_BE(X) BSWAP(X) -# define TGT_LE(X) (X) -#endif - #if DATA_SIZE == 1 # define helper_le_ld_name glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX) # define helper_be_ld_name helper_le_ld_name @@ -102,14 +94,6 @@ # define helper_be_st_name glue(glue(helper_be_st, SUFFIX), MMUSUFFIX) #endif -#ifdef TARGET_WORDS_BIGENDIAN -# define helper_te_ld_name helper_be_ld_name -# define helper_te_st_name helper_be_st_name -#else -# define helper_te_ld_name helper_le_ld_name -# define helper_te_st_name helper_le_st_name -#endif - #ifndef SOFTMMU_CODE_ACCESS static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env, size_t mmu_idx, size_t index, @@ -441,15 +425,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val, #undef USUFFIX #undef SSUFFIX #undef BSWAP -#undef TGT_BE -#undef TGT_LE -#undef CPU_BE -#undef CPU_LE #undef helper_le_ld_name #undef helper_be_ld_name #undef helper_le_lds_name #undef helper_be_lds_name #undef helper_le_st_name #undef helper_be_st_name -#undef helper_te_ld_name -#undef helper_te_st_name From c482cb117cc418115ca9c6d21a7a2315414c0a40 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 28 Jun 2016 11:37:27 -0700 Subject: [PATCH 15/37] tcg: Add atomic helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add all of cmpxchg, op_fetch, fetch_op, and xchg. Handle both endian-ness, and sizes up to 8. Handle expanding non-atomically, when emulating in serial. Reviewed-by: Emilio G. 
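To make the shape of the new API concrete, a front-end sketch (illustrative only; gen_atomic_add_example is not part of the patch): the translator emits a single op, and the expansion decides, based on parallel_cpus, whether to call an out-of-line atomic helper or to emit a plain load/add/store sequence.

static void gen_atomic_add_example(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
                                   int mem_idx)
{
    /* Old memory value is returned in ret; in parallel mode this calls
       one of the new atomic_fetch_add helpers, in serial mode it expands
       to an ordinary load, add, store sequence. */
    tcg_gen_atomic_fetch_add_i64(ret, addr, val, mem_idx, MO_TEQ);
}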
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- Makefile.objs | 2 +- Makefile.target | 1 + atomic_template.h | 177 +++++++++++++++++++++++++ cputlb.c | 112 +++++++++++++++- tcg-runtime.c | 49 +++++-- tcg/tcg-op.c | 328 ++++++++++++++++++++++++++++++++++++++++++++++ tcg/tcg-op.h | 44 +++++++ tcg/tcg-runtime.h | 75 +++++++++++ tcg/tcg.h | 53 ++++++++ 9 files changed, 826 insertions(+), 15 deletions(-) create mode 100644 atomic_template.h diff --git a/Makefile.objs b/Makefile.objs index 82e8a1e10a37..06f74b8b995d 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -89,7 +89,7 @@ endif ####################################################################### # Target-independent parts used in system and user emulation -common-obj-y += tcg-runtime.o cpus-common.o +common-obj-y += cpus-common.o common-obj-y += hw/ common-obj-y += qom/ common-obj-y += disas/ diff --git a/Makefile.target b/Makefile.target index 2c4609122588..7a5080e94a29 100644 --- a/Makefile.target +++ b/Makefile.target @@ -94,6 +94,7 @@ obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o obj-y += fpu/softfloat.o obj-y += target-$(TARGET_BASE_ARCH)/ obj-y += disas.o +obj-y += tcg-runtime.o obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o diff --git a/atomic_template.h b/atomic_template.h new file mode 100644 index 000000000000..cf837254034b --- /dev/null +++ b/atomic_template.h @@ -0,0 +1,177 @@ +/* + * Atomic helper templates + * Included from tcg-runtime.c and cputlb.c. + * + * Copyright (c) 2016 Red Hat, Inc + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#if DATA_SIZE == 8 +# define SUFFIX q +# define DATA_TYPE uint64_t +# define BSWAP bswap64 +#elif DATA_SIZE == 4 +# define SUFFIX l +# define DATA_TYPE uint32_t +# define BSWAP bswap32 +#elif DATA_SIZE == 2 +# define SUFFIX w +# define DATA_TYPE uint16_t +# define BSWAP bswap16 +#elif DATA_SIZE == 1 +# define SUFFIX b +# define DATA_TYPE uint8_t +# define BSWAP +#else +# error unsupported data size +#endif + +#if DATA_SIZE >= 4 +# define ABI_TYPE DATA_TYPE +#else +# define ABI_TYPE uint32_t +#endif + +/* Define host-endian atomic operations. Note that END is used within + the ATOMIC_NAME macro, and redefined below. 
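(For instance, when instantiated from cputlb.c with DATA_SIZE == 4, the host-endian block produces helper_atomic_cmpxchgl_le_mmu on a little-endian host and helper_atomic_cmpxchgl_be_mmu on a big-endian one; the 1-byte helpers carry no endian suffix at all.)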
*/ +#if DATA_SIZE == 1 +# define END +#elif defined(HOST_WORDS_BIGENDIAN) +# define END _be +#else +# define END _le +#endif + +ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, + ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + return atomic_cmpxchg__nocheck(haddr, cmpv, newv); +} + +ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + return atomic_xchg__nocheck(haddr, val); +} + +#define GEN_ATOMIC_HELPER(X) \ +ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ + ABI_TYPE val EXTRA_ARGS) \ +{ \ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ + return atomic_##X(haddr, val); \ +} \ + +GEN_ATOMIC_HELPER(fetch_add) +GEN_ATOMIC_HELPER(fetch_and) +GEN_ATOMIC_HELPER(fetch_or) +GEN_ATOMIC_HELPER(fetch_xor) +GEN_ATOMIC_HELPER(add_fetch) +GEN_ATOMIC_HELPER(and_fetch) +GEN_ATOMIC_HELPER(or_fetch) +GEN_ATOMIC_HELPER(xor_fetch) + +#undef GEN_ATOMIC_HELPER +#undef END + +#if DATA_SIZE > 1 + +/* Define reverse-host-endian atomic operations. Note that END is used + within the ATOMIC_NAME macro. */ +#ifdef HOST_WORDS_BIGENDIAN +# define END _le +#else +# define END _be +#endif + +ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, + ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv))); +} + +ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + return BSWAP(atomic_xchg__nocheck(haddr, BSWAP(val))); +} + +#define GEN_ATOMIC_HELPER(X) \ +ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ + ABI_TYPE val EXTRA_ARGS) \ +{ \ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ + return BSWAP(atomic_##X(haddr, BSWAP(val))); \ +} + +GEN_ATOMIC_HELPER(fetch_and) +GEN_ATOMIC_HELPER(fetch_or) +GEN_ATOMIC_HELPER(fetch_xor) +GEN_ATOMIC_HELPER(and_fetch) +GEN_ATOMIC_HELPER(or_fetch) +GEN_ATOMIC_HELPER(xor_fetch) + +#undef GEN_ATOMIC_HELPER + +/* Note that for addition, we need to use a separate cmpxchg loop instead + of bswaps for the reverse-host-endian helpers. 
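Byte swapping does not commute with addition (BSWAP(a) + BSWAP(b) is not BSWAP(a + b) in general), so the value is loaded, swapped to guest order, added, swapped back, and only then installed with a compare-and-swap retry loop.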
*/ +ABI_TYPE ATOMIC_NAME(fetch_add)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + DATA_TYPE ldo, ldn, ret, sto; + + ldo = atomic_read__nocheck(haddr); + while (1) { + ret = BSWAP(ldo); + sto = BSWAP(ret + val); + ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto); + if (ldn == ldo) { + return ret; + } + ldo = ldn; + } +} + +ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + DATA_TYPE ldo, ldn, ret, sto; + + ldo = atomic_read__nocheck(haddr); + while (1) { + ret = BSWAP(ldo) + val; + sto = BSWAP(ret); + ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto); + if (ldn == ldo) { + return ret; + } + ldo = ldn; + } +} + +#undef END +#endif /* DATA_SIZE > 1 */ + +#undef BSWAP +#undef ABI_TYPE +#undef DATA_TYPE +#undef SUFFIX +#undef DATA_SIZE diff --git a/cputlb.c b/cputlb.c index 82cf46ea6492..4f2c5002f6d5 100644 --- a/cputlb.c +++ b/cputlb.c @@ -23,15 +23,15 @@ #include "exec/memory.h" #include "exec/address-spaces.h" #include "exec/cpu_ldst.h" - #include "exec/cputlb.h" - #include "exec/memory-internal.h" #include "exec/ram_addr.h" #include "exec/exec-all.h" #include "tcg/tcg.h" #include "qemu/error-report.h" #include "exec/log.h" +#include "exec/helper-proto.h" +#include "qemu/atomic.h" /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */ /* #define DEBUG_TLB */ @@ -585,6 +585,69 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, } } +/* Probe for a read-modify-write atomic operation. Do not allow unaligned + * operations, or io operations to proceed. Return the host address. */ +static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr) +{ + size_t mmu_idx = get_mmuidx(oi); + size_t index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index]; + target_ulong tlb_addr = tlbe->addr_write; + TCGMemOp mop = get_memop(oi); + int a_bits = get_alignment_bits(mop); + int s_bits = mop & MO_SIZE; + + /* Adjust the given return address. */ + retaddr -= GETPC_ADJ; + + /* Enforce guest required alignment. */ + if (unlikely(a_bits > 0 && (addr & ((1 << a_bits) - 1)))) { + /* ??? Maybe indicate atomic op to cpu_unaligned_access */ + cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE, + mmu_idx, retaddr); + } + + /* Enforce qemu required alignment. */ + if (unlikely(addr & ((1 << s_bits) - 1))) { + /* We get here if guest alignment was not requested, + or was not enforced by cpu_unaligned_access above. + We might widen the access and emulate, but for now + mark an exception and exit the cpu loop. */ + goto stop_the_world; + } + + /* Check TLB entry and enforce page permissions. */ + if ((addr & TARGET_PAGE_MASK) + != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (!VICTIM_TLB_HIT(addr_write, addr)) { + tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr); + } + tlb_addr = tlbe->addr_write; + } + + /* Notice an IO access, or a notdirty page. */ + if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) { + /* There's really nothing that can be done to + support this apart from stop-the-world. */ + goto stop_the_world; + } + + /* Let the guest notice RMW on a write-only page. 
*/ + if (unlikely(tlbe->addr_read != tlb_addr)) { + tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD, mmu_idx, retaddr); + /* Since we don't support reads and writes to different addresses, + and we do have the proper page loaded for write, this shouldn't + ever return. But just in case, handle via stop-the-world. */ + goto stop_the_world; + } + + return (void *)((uintptr_t)addr + tlbe->addend); + + stop_the_world: + cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr); +} + #ifdef TARGET_WORDS_BIGENDIAN # define TGT_BE(X) (X) # define TGT_LE(X) BSWAP(X) @@ -606,8 +669,51 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, #define DATA_SIZE 8 #include "softmmu_template.h" -#undef MMUSUFFIX +/* First set of helpers allows passing in of OI and RETADDR. This makes + them callable from other helpers. */ + +#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr +#define ATOMIC_NAME(X) \ + HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu)) +#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr) + +#define DATA_SIZE 1 +#include "atomic_template.h" + +#define DATA_SIZE 2 +#include "atomic_template.h" + +#define DATA_SIZE 4 +#include "atomic_template.h" + +#define DATA_SIZE 8 +#include "atomic_template.h" + +/* Second set of helpers are directly callable from TCG as helpers. */ + +#undef EXTRA_ARGS +#undef ATOMIC_NAME +#undef ATOMIC_MMU_LOOKUP +#define EXTRA_ARGS , TCGMemOpIdx oi +#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END)) +#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, GETPC()) + +#define DATA_SIZE 1 +#include "atomic_template.h" + +#define DATA_SIZE 2 +#include "atomic_template.h" + +#define DATA_SIZE 4 +#include "atomic_template.h" + +#define DATA_SIZE 8 +#include "atomic_template.h" + +/* Code access functions. */ + +#undef MMUSUFFIX #define MMUSUFFIX _cmmu #undef GETPC #define GETPC() ((uintptr_t)0) diff --git a/tcg-runtime.c b/tcg-runtime.c index ea2ad649cbb5..aa55d12e1273 100644 --- a/tcg-runtime.c +++ b/tcg-runtime.c @@ -23,17 +23,10 @@ */ #include "qemu/osdep.h" #include "qemu/host-utils.h" - -/* This file is compiled once, and thus we can't include the standard - "exec/helper-proto.h", which has includes that are target specific. */ - -#include "exec/helper-head.h" - -#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \ - dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)); - -#include "tcg-runtime.h" - +#include "cpu.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" /* 32-bit helpers */ @@ -107,3 +100,37 @@ int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2) muls64(&l, &h, arg1, arg2); return h; } + +#ifndef CONFIG_SOFTMMU +/* The softmmu versions of these helpers are in cputlb.c. */ + +/* Do not allow unaligned operations to proceed. Return the host address. */ +static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, + int size, uintptr_t retaddr) +{ + /* Enforce qemu required alignment. */ + if (unlikely(addr & (size - 1))) { + cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr); + } + return g2h(addr); +} + +/* Macro to call the above, with local variables from the use context. 
*/ +#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC()) + +#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END)) +#define EXTRA_ARGS + +#define DATA_SIZE 1 +#include "atomic_template.h" + +#define DATA_SIZE 2 +#include "atomic_template.h" + +#define DATA_SIZE 4 +#include "atomic_template.h" + +#define DATA_SIZE 8 +#include "atomic_template.h" + +#endif /* !CONFIG_SOFTMMU */ diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 291d50bb7d5d..65e36637d029 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -1975,3 +1975,331 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop) addr, trace_mem_get_info(memop, 1)); gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx); } + +static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, TCGMemOp opc) +{ + switch (opc & MO_SSIZE) { + case MO_SB: + tcg_gen_ext8s_i32(ret, val); + break; + case MO_UB: + tcg_gen_ext8u_i32(ret, val); + break; + case MO_SW: + tcg_gen_ext16s_i32(ret, val); + break; + case MO_UW: + tcg_gen_ext16u_i32(ret, val); + break; + default: + tcg_gen_mov_i32(ret, val); + break; + } +} + +static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, TCGMemOp opc) +{ + switch (opc & MO_SSIZE) { + case MO_SB: + tcg_gen_ext8s_i64(ret, val); + break; + case MO_UB: + tcg_gen_ext8u_i64(ret, val); + break; + case MO_SW: + tcg_gen_ext16s_i64(ret, val); + break; + case MO_UW: + tcg_gen_ext16u_i64(ret, val); + break; + case MO_SL: + tcg_gen_ext32s_i64(ret, val); + break; + case MO_UL: + tcg_gen_ext32u_i64(ret, val); + break; + default: + tcg_gen_mov_i64(ret, val); + break; + } +} + +#ifdef CONFIG_SOFTMMU +typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv, + TCGv_i32, TCGv_i32, TCGv_i32); +typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv, + TCGv_i64, TCGv_i64, TCGv_i32); +typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, + TCGv_i32, TCGv_i32); +typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, + TCGv_i64, TCGv_i32); +#else +typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32, TCGv_i32); +typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64, TCGv_i64); +typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32); +typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64); +#endif + +static void * const table_cmpxchg[16] = { + [MO_8] = gen_helper_atomic_cmpxchgb, + [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le, + [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be, + [MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le, + [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be, + [MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le, + [MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be, +}; + +void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv, + TCGv_i32 newv, TCGArg idx, TCGMemOp memop) +{ + memop = tcg_canonicalize_memop(memop, 0, 0); + + if (!parallel_cpus) { + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + + tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE); + + tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN); + tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1); + tcg_gen_qemu_st_i32(t2, addr, idx, memop); + tcg_temp_free_i32(t2); + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(retv, t1, memop); + } else { + tcg_gen_mov_i32(retv, t1); + } + tcg_temp_free_i32(t1); + } else { + gen_atomic_cx_i32 gen; + + gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + +#ifdef CONFIG_SOFTMMU + { + TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, 
idx)); + gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi); + tcg_temp_free_i32(oi); + } +#else + gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv); +#endif + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(retv, retv, memop); + } + } +} + +void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv, + TCGv_i64 newv, TCGArg idx, TCGMemOp memop) +{ + memop = tcg_canonicalize_memop(memop, 1, 0); + + if (!parallel_cpus) { + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE); + + tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN); + tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1); + tcg_gen_qemu_st_i64(t2, addr, idx, memop); + tcg_temp_free_i64(t2); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(retv, t1, memop); + } else { + tcg_gen_mov_i64(retv, t1); + } + tcg_temp_free_i64(t1); + } else if ((memop & MO_SIZE) == MO_64) { + gen_atomic_cx_i64 gen; + + gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + +#ifdef CONFIG_SOFTMMU + { + TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx)); + gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi); + tcg_temp_free_i32(oi); + } +#else + gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv); +#endif + } else { + TCGv_i32 c32 = tcg_temp_new_i32(); + TCGv_i32 n32 = tcg_temp_new_i32(); + TCGv_i32 r32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(c32, cmpv); + tcg_gen_extrl_i64_i32(n32, newv); + tcg_gen_atomic_cmpxchg_i32(r32, addr, c32, n32, idx, memop & ~MO_SIGN); + tcg_temp_free_i32(c32); + tcg_temp_free_i32(n32); + + tcg_gen_extu_i32_i64(retv, r32); + tcg_temp_free_i32(r32); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(retv, retv, memop); + } + } +} + +static void do_nonatomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val, + TCGArg idx, TCGMemOp memop, bool new_val, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + + memop = tcg_canonicalize_memop(memop, 0, 0); + + tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN); + gen(t2, t1, val); + tcg_gen_qemu_st_i32(t2, addr, idx, memop); + + tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t2); +} + +static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val, + TCGArg idx, TCGMemOp memop, void * const table[]) +{ + gen_atomic_op_i32 gen; + + memop = tcg_canonicalize_memop(memop, 0, 0); + + gen = table[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + +#ifdef CONFIG_SOFTMMU + { + TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx)); + gen(ret, tcg_ctx.tcg_env, addr, val, oi); + tcg_temp_free_i32(oi); + } +#else + gen(ret, tcg_ctx.tcg_env, addr, val); +#endif + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(ret, ret, memop); + } +} + +static void do_nonatomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, + TCGArg idx, TCGMemOp memop, bool new_val, + void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + memop = tcg_canonicalize_memop(memop, 1, 0); + + tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN); + gen(t2, t1, val); + tcg_gen_qemu_st_i64(t2, addr, idx, memop); + + tcg_gen_ext_i64(ret, (new_val ? 
t2 : t1), memop); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, + TCGArg idx, TCGMemOp memop, void * const table[]) +{ + memop = tcg_canonicalize_memop(memop, 1, 0); + + if ((memop & MO_SIZE) == MO_64) { + gen_atomic_op_i64 gen; + + gen = table[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + +#ifdef CONFIG_SOFTMMU + { + TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx)); + gen(ret, tcg_ctx.tcg_env, addr, val, oi); + tcg_temp_free_i32(oi); + } +#else + gen(ret, tcg_ctx.tcg_env, addr, val); +#endif + } else { + TCGv_i32 v32 = tcg_temp_new_i32(); + TCGv_i32 r32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(v32, val); + do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table); + tcg_temp_free_i32(v32); + + tcg_gen_extu_i32_i64(ret, r32); + tcg_temp_free_i32(r32); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(ret, ret, memop); + } + } +} + +#define GEN_ATOMIC_HELPER(NAME, OP, NEW) \ +static void * const table_##NAME[16] = { \ + [MO_8] = gen_helper_atomic_##NAME##b, \ + [MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le, \ + [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be, \ + [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le, \ + [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be, \ + [MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le, \ + [MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be, \ +}; \ +void tcg_gen_atomic_##NAME##_i32 \ + (TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, TCGMemOp memop) \ +{ \ + if (parallel_cpus) { \ + do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \ + } else { \ + do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \ + tcg_gen_##OP##_i32); \ + } \ +} \ +void tcg_gen_atomic_##NAME##_i64 \ + (TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, TCGMemOp memop) \ +{ \ + if (parallel_cpus) { \ + do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \ + } else { \ + do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \ + tcg_gen_##OP##_i64); \ + } \ +} + +GEN_ATOMIC_HELPER(fetch_add, add, 0) +GEN_ATOMIC_HELPER(fetch_and, and, 0) +GEN_ATOMIC_HELPER(fetch_or, or, 0) +GEN_ATOMIC_HELPER(fetch_xor, xor, 0) + +GEN_ATOMIC_HELPER(add_fetch, add, 1) +GEN_ATOMIC_HELPER(and_fetch, and, 1) +GEN_ATOMIC_HELPER(or_fetch, or, 1) +GEN_ATOMIC_HELPER(xor_fetch, xor, 1) + +static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mov_i32(r, b); +} + +static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mov_i64(r, b); +} + +GEN_ATOMIC_HELPER(xchg, mov2, 0) + +#undef GEN_ATOMIC_HELPER diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index 02cb3766819d..89b59e867ac9 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -854,6 +854,30 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index) tcg_gen_qemu_st_i64(arg, addr, mem_index, MO_TEQ); } +void tcg_gen_atomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32, + TCGArg, TCGMemOp); +void tcg_gen_atomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64, + TCGArg, TCGMemOp); + +void tcg_gen_atomic_xchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_xchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_add_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_add_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_and_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_and_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void 
tcg_gen_atomic_fetch_or_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_or_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_xor_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_fetch_xor_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_add_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_add_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_and_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_and_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_or_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); +void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); + #if TARGET_LONG_BITS == 64 #define tcg_gen_movi_tl tcg_gen_movi_i64 #define tcg_gen_mov_tl tcg_gen_mov_i64 @@ -932,6 +956,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index) #define tcg_gen_sub2_tl tcg_gen_sub2_i64 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64 #define tcg_gen_muls2_tl tcg_gen_muls2_i64 +#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i64 +#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i64 +#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i64 +#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i64 +#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i64 +#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i64 +#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i64 +#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64 +#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64 +#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64 #else #define tcg_gen_movi_tl tcg_gen_movi_i32 #define tcg_gen_mov_tl tcg_gen_mov_i32 @@ -1009,6 +1043,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index) #define tcg_gen_sub2_tl tcg_gen_sub2_i32 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32 #define tcg_gen_muls2_tl tcg_gen_muls2_i32 +#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i32 +#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i32 +#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i32 +#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i32 +#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i32 +#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i32 +#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i32 +#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32 +#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32 +#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32 #endif #if UINTPTR_MAX == UINT32_MAX diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h index 23a0c37711c2..22367aaf974a 100644 --- a/tcg/tcg-runtime.h +++ b/tcg/tcg-runtime.h @@ -14,3 +14,78 @@ DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64) DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64) DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64) + +#ifdef CONFIG_SOFTMMU + +DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) +DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) +DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) 
+DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, + i64, env, tl, i64, i64, i32) +DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) +DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) +DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, + i64, env, tl, i64, i64, i32) + +#define GEN_ATOMIC_HELPERS(NAME) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le), \ + TCG_CALL_NO_WG, i64, env, tl, i64, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be), \ + TCG_CALL_NO_WG, i64, env, tl, i64, i32) + +#else + +DEF_HELPER_FLAGS_4(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env, tl, i64, i64) +DEF_HELPER_FLAGS_4(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env, tl, i64, i64) + +#define GEN_ATOMIC_HELPERS(NAME) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), b), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), q_le), \ + TCG_CALL_NO_WG, i64, env, tl, i64) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), q_be), \ + TCG_CALL_NO_WG, i64, env, tl, i64) + +#endif /* CONFIG_SOFTMMU */ + +GEN_ATOMIC_HELPERS(fetch_add) +GEN_ATOMIC_HELPERS(fetch_and) +GEN_ATOMIC_HELPERS(fetch_or) +GEN_ATOMIC_HELPERS(fetch_xor) + +GEN_ATOMIC_HELPERS(add_fetch) +GEN_ATOMIC_HELPERS(and_fetch) +GEN_ATOMIC_HELPERS(or_fetch) +GEN_ATOMIC_HELPERS(xor_fetch) + +GEN_ATOMIC_HELPERS(xchg) + +#undef GEN_ATOMIC_HELPERS diff --git a/tcg/tcg.h b/tcg/tcg.h index 3b211560c879..593196571ee0 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -1177,6 +1177,59 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr, # define helper_ret_ldq_cmmu helper_le_ldq_cmmu #endif +uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr, + uint32_t cmpv, uint32_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint32_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr, + uint32_t cmpv, uint32_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr, + uint32_t cmpv, uint32_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr, + uint64_t cmpv, uint64_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint32_t 
helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr, + uint32_t cmpv, uint32_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr, + uint32_t cmpv, uint32_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); +uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr, + uint64_t cmpv, uint64_t newv, + TCGMemOpIdx oi, uintptr_t retaddr); + +#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX) \ +TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu \ + (CPUArchState *env, target_ulong addr, TYPE val, \ + TCGMemOpIdx oi, uintptr_t retaddr); + +#define GEN_ATOMIC_HELPER_ALL(NAME) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, b) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, w_le) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \ + GEN_ATOMIC_HELPER(NAME, uint64_t, q_le) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, w_be) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, l_be) \ + GEN_ATOMIC_HELPER(NAME, uint64_t, q_be) + +GEN_ATOMIC_HELPER_ALL(fetch_add) +GEN_ATOMIC_HELPER_ALL(fetch_sub) +GEN_ATOMIC_HELPER_ALL(fetch_and) +GEN_ATOMIC_HELPER_ALL(fetch_or) +GEN_ATOMIC_HELPER_ALL(fetch_xor) + +GEN_ATOMIC_HELPER_ALL(add_fetch) +GEN_ATOMIC_HELPER_ALL(sub_fetch) +GEN_ATOMIC_HELPER_ALL(and_fetch) +GEN_ATOMIC_HELPER_ALL(or_fetch) +GEN_ATOMIC_HELPER_ALL(xor_fetch) + +GEN_ATOMIC_HELPER_ALL(xchg) + +#undef GEN_ATOMIC_HELPER_ALL +#undef GEN_ATOMIC_HELPER + #endif /* CONFIG_SOFTMMU */ #endif /* TCG_H */ From 7ebee43ee3e2fcd7b5063058b7ef74bc43216733 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 29 Jun 2016 21:10:59 -0700 Subject: [PATCH 16/37] tcg: Add atomic128 helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Force the use of cmpxchg16b on x86_64. Wikipedia suggests that only very old AMD64 (circa 2004) did not have this instruction. Further, it's required by Windows 8 so no new cpus will ever omit it. If we truly care about these, then we could check this at startup time and then avoid executing paths that use it. Reviewed-by: Emilio G. Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- atomic_template.h | 40 +++++++++++++++++++++++++++++++++++++++++++- configure | 29 ++++++++++++++++++++++++++++- cputlb.c | 5 +++++ include/qemu/int128.h | 6 ++++++ tcg-runtime.c | 18 ++++++++++++++++++ tcg/tcg.h | 24 +++++++++++++++++++++++- 6 files changed, 119 insertions(+), 3 deletions(-) diff --git a/atomic_template.h b/atomic_template.h index cf837254034b..b400b2a3d3a1 100644 --- a/atomic_template.h +++ b/atomic_template.h @@ -18,7 +18,11 @@ * License along with this library; if not, see .
*/ -#if DATA_SIZE == 8 +#if DATA_SIZE == 16 +# define SUFFIX o +# define DATA_TYPE Int128 +# define BSWAP bswap128 +#elif DATA_SIZE == 8 # define SUFFIX q # define DATA_TYPE uint64_t # define BSWAP bswap64 @@ -61,6 +65,21 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, return atomic_cmpxchg__nocheck(haddr, cmpv, newv); } +#if DATA_SIZE >= 16 +ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) +{ + DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP; + __atomic_load(haddr, &val, __ATOMIC_RELAXED); + return val; +} + +void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + __atomic_store(haddr, &val, __ATOMIC_RELAXED); +} +#else ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val EXTRA_ARGS) { @@ -86,6 +105,8 @@ GEN_ATOMIC_HELPER(or_fetch) GEN_ATOMIC_HELPER(xor_fetch) #undef GEN_ATOMIC_HELPER +#endif /* DATA SIZE >= 16 */ + #undef END #if DATA_SIZE > 1 @@ -105,6 +126,22 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv))); } +#if DATA_SIZE >= 16 +ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) +{ + DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP; + __atomic_load(haddr, &val, __ATOMIC_RELAXED); + return BSWAP(val); +} + +void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, + ABI_TYPE val EXTRA_ARGS) +{ + DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + val = BSWAP(val); + __atomic_store(haddr, &val, __ATOMIC_RELAXED); +} +#else ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val EXTRA_ARGS) { @@ -166,6 +203,7 @@ ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr, ldo = ldn; } } +#endif /* DATA_SIZE >= 16 */ #undef END #endif /* DATA_SIZE > 1 */ diff --git a/configure b/configure index d3dafcbb373e..cc3301fa0a89 100755 --- a/configure +++ b/configure @@ -1216,7 +1216,10 @@ case "$cpu" in cc_i386='$(CC) -m32' ;; x86_64) - CPU_CFLAGS="-m64" + # ??? Only extremely old AMD cpus do not have cmpxchg16b. + # If we truly care, we should simply detect this case at + # runtime and generate the fallback to serial emulation. + CPU_CFLAGS="-m64 -mcx16" LDFLAGS="-m64 $LDFLAGS" cc_i386='$(CC) -m32' ;; @@ -4521,6 +4524,26 @@ if compile_prog "" "" ; then int128=yes fi +######################################### +# See if 128-bit atomic operations are supported. + +atomic128=no +if test "$int128" = "yes"; then + cat > $TMPC << EOF +int main(void) +{ + unsigned __int128 x = 0, y = 0; + y = __atomic_load_16(&x, 0); + __atomic_store_16(&x, y, 0); + __atomic_compare_exchange_16(&x, &y, x, 0, 0, 0); + return 0; +} +EOF + if compile_prog "" "" ; then + atomic128=yes + fi +fi + ######################################## # check if getauxval is available. @@ -5483,6 +5506,10 @@ if test "$int128" = "yes" ; then echo "CONFIG_INT128=y" >> $config_host_mak fi +if test "$atomic128" = "yes" ; then + echo "CONFIG_ATOMIC128=y" >> $config_host_mak +fi + if test "$getauxval" = "yes" ; then echo "CONFIG_GETAUXVAL=y" >> $config_host_mak fi diff --git a/cputlb.c b/cputlb.c index 4f2c5002f6d5..845b2a7acf24 100644 --- a/cputlb.c +++ b/cputlb.c @@ -690,6 +690,11 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define DATA_SIZE 8 #include "atomic_template.h" +#ifdef CONFIG_ATOMIC128 +#define DATA_SIZE 16 +#include "atomic_template.h" +#endif + /* Second set of helpers are directly callable from TCG as helpers. 
*/ #undef EXTRA_ARGS diff --git a/include/qemu/int128.h b/include/qemu/int128.h index d4c6e44331f8..5c9890db8b39 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -2,6 +2,7 @@ #define INT128_H #ifdef CONFIG_INT128 +#include "qemu/bswap.h" typedef __int128_t Int128; @@ -137,6 +138,11 @@ static inline void int128_subfrom(Int128 *a, Int128 b) *a -= b; } +static inline Int128 bswap128(Int128 a) +{ + return int128_make128(bswap64(int128_gethi(a)), bswap64(int128_getlo(a))); +} + #else /* !CONFIG_INT128 */ typedef struct Int128 Int128; diff --git a/tcg-runtime.c b/tcg-runtime.c index aa55d12e1273..e9521531cd18 100644 --- a/tcg-runtime.c +++ b/tcg-runtime.c @@ -133,4 +133,22 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define DATA_SIZE 8 #include "atomic_template.h" +/* The following is only callable from other helpers, and matches up + with the softmmu version. */ + +#ifdef CONFIG_ATOMIC128 + +#undef EXTRA_ARGS +#undef ATOMIC_NAME +#undef ATOMIC_MMU_LOOKUP + +#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr +#define ATOMIC_NAME(X) \ + HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu)) +#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, retaddr) + +#define DATA_SIZE 16 +#include "atomic_template.h" +#endif /* CONFIG_ATOMIC128 */ + #endif /* !CONFIG_SOFTMMU */ diff --git a/tcg/tcg.h b/tcg/tcg.h index 593196571ee0..bc3ea7af1968 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -1229,7 +1229,29 @@ GEN_ATOMIC_HELPER_ALL(xchg) #undef GEN_ATOMIC_HELPER_ALL #undef GEN_ATOMIC_HELPER - #endif /* CONFIG_SOFTMMU */ +#ifdef CONFIG_ATOMIC128 +#include "qemu/int128.h" + +/* These aren't really a "proper" helpers because TCG cannot manage Int128. + However, use the same format as the others, for use by the backends. */ +Int128 helper_atomic_cmpxchgo_le_mmu(CPUArchState *env, target_ulong addr, + Int128 cmpv, Int128 newv, + TCGMemOpIdx oi, uintptr_t retaddr); +Int128 helper_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr, + Int128 cmpv, Int128 newv, + TCGMemOpIdx oi, uintptr_t retaddr); + +Int128 helper_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr); +Int128 helper_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr); +void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val, + TCGMemOpIdx oi, uintptr_t retaddr); +void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val, + TCGMemOpIdx oi, uintptr_t retaddr); + +#endif /* CONFIG_ATOMIC128 */ + #endif /* TCG_H */ From df79b996a7b21c6ea7847f7927a2e1a294b86c72 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 2 Sep 2016 12:23:57 -0700 Subject: [PATCH 17/37] tcg: Add CONFIG_ATOMIC64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow qemu to build on 32-bit hosts without 64-bit atomic ops. Even if we only allow 32-bit hosts to multi-thread emulate 32-bit guests, we still need some way to handle the 32-bit guest using a 64-bit atomic operation. Do so by dropping back to single-step. Reviewed-by: Emilio G. 
Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- configure | 33 +++++++++++++++++++++++++++++++++ cputlb.c | 4 ++++ tcg-runtime.c | 7 +++++++ tcg/tcg-op.c | 22 ++++++++++++++++++---- tcg/tcg-runtime.h | 46 ++++++++++++++++++++++++++++++++++++++++------ tcg/tcg.h | 15 ++++++++++++--- 6 files changed, 114 insertions(+), 13 deletions(-) diff --git a/configure b/configure index cc3301fa0a89..8e100596077d 100755 --- a/configure +++ b/configure @@ -4544,6 +4544,35 @@ EOF fi fi +######################################### +# See if 64-bit atomic operations are supported. +# Note that without __atomic builtins, we can only +# assume atomic loads/stores max at pointer size. + +cat > $TMPC << EOF +#include +int main(void) +{ + uint64_t x = 0, y = 0; +#ifdef __ATOMIC_RELAXED + y = __atomic_load_8(&x, 0); + __atomic_store_8(&x, y, 0); + __atomic_compare_exchange_8(&x, &y, x, 0, 0, 0); + __atomic_exchange_8(&x, y, 0); + __atomic_fetch_add_8(&x, y, 0); +#else + typedef char is_host64[sizeof(void *) >= sizeof(uint64_t) ? 1 : -1]; + __sync_lock_test_and_set(&x, y); + __sync_val_compare_and_swap(&x, y, 0); + __sync_fetch_and_add(&x, y); +#endif + return 0; +} +EOF +if compile_prog "" "" ; then + atomic64=yes +fi + ######################################## # check if getauxval is available. @@ -5510,6 +5539,10 @@ if test "$atomic128" = "yes" ; then echo "CONFIG_ATOMIC128=y" >> $config_host_mak fi +if test "$atomic64" = "yes" ; then + echo "CONFIG_ATOMIC64=y" >> $config_host_mak +fi + if test "$getauxval" = "yes" ; then echo "CONFIG_GETAUXVAL=y" >> $config_host_mak fi diff --git a/cputlb.c b/cputlb.c index 845b2a7acf24..cc4da4d7ebc9 100644 --- a/cputlb.c +++ b/cputlb.c @@ -687,8 +687,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define DATA_SIZE 4 #include "atomic_template.h" +#ifdef CONFIG_ATOMIC64 #define DATA_SIZE 8 #include "atomic_template.h" +#endif #ifdef CONFIG_ATOMIC128 #define DATA_SIZE 16 @@ -713,8 +715,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define DATA_SIZE 4 #include "atomic_template.h" +#ifdef CONFIG_ATOMIC64 #define DATA_SIZE 8 #include "atomic_template.h" +#endif /* Code access functions. */ diff --git a/tcg-runtime.c b/tcg-runtime.c index e9521531cd18..9327b6f23bea 100644 --- a/tcg-runtime.c +++ b/tcg-runtime.c @@ -101,6 +101,11 @@ int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2) return h; } +void HELPER(exit_atomic)(CPUArchState *env) +{ + cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC()); +} + #ifndef CONFIG_SOFTMMU /* The softmmu versions of these helpers are in cputlb.c. */ @@ -130,8 +135,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define DATA_SIZE 4 #include "atomic_template.h" +#ifdef CONFIG_ATOMIC64 #define DATA_SIZE 8 #include "atomic_template.h" +#endif /* The following is only callable from other helpers, and matches up with the softmmu version. 
*/ diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 65e36637d029..cdd61d678bcd 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -2040,14 +2040,20 @@ typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32); typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64); #endif +#ifdef CONFIG_ATOMIC64 +# define WITH_ATOMIC64(X) X, +#else +# define WITH_ATOMIC64(X) +#endif + static void * const table_cmpxchg[16] = { [MO_8] = gen_helper_atomic_cmpxchgb, [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le, [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be, [MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le, [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be, - [MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le, - [MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be, + WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le) + WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be) }; void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv, @@ -2117,6 +2123,7 @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv, } tcg_temp_free_i64(t1); } else if ((memop & MO_SIZE) == MO_64) { +#ifdef CONFIG_ATOMIC64 gen_atomic_cx_i64 gen; gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; @@ -2131,6 +2138,9 @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv, #else gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv); #endif +#else + gen_helper_exit_atomic(tcg_ctx.tcg_env); +#endif /* CONFIG_ATOMIC64 */ } else { TCGv_i32 c32 = tcg_temp_new_i32(); TCGv_i32 n32 = tcg_temp_new_i32(); @@ -2218,6 +2228,7 @@ static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, memop = tcg_canonicalize_memop(memop, 1, 0); if ((memop & MO_SIZE) == MO_64) { +#ifdef CONFIG_ATOMIC64 gen_atomic_op_i64 gen; gen = table[memop & (MO_SIZE | MO_BSWAP)]; @@ -2232,6 +2243,9 @@ static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, #else gen(ret, tcg_ctx.tcg_env, addr, val); #endif +#else + gen_helper_exit_atomic(tcg_ctx.tcg_env); +#endif /* CONFIG_ATOMIC64 */ } else { TCGv_i32 v32 = tcg_temp_new_i32(); TCGv_i32 r32 = tcg_temp_new_i32(); @@ -2256,8 +2270,8 @@ static void * const table_##NAME[16] = { \ [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be, \ [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le, \ [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be, \ - [MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le, \ - [MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be, \ + WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le) \ + WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be) \ }; \ void tcg_gen_atomic_##NAME##_i32 \ (TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, TCGMemOp memop) \ diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h index 22367aaf974a..1deb86a0997f 100644 --- a/tcg/tcg-runtime.h +++ b/tcg/tcg-runtime.h @@ -15,23 +15,28 @@ DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64) DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64) DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env) + #ifdef CONFIG_SOFTMMU DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env, tl, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32, i32) -DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) -DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, - i64, env, tl, i64, i64, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32, i32) 
+DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, + i32, env, tl, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32, i32) +#ifdef CONFIG_ATOMIC64 +DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, + i64, env, tl, i64, i64, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env, tl, i64, i64, i32) +#endif +#ifdef CONFIG_ATOMIC64 #define GEN_ATOMIC_HELPERS(NAME) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \ TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ @@ -47,17 +52,33 @@ DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, TCG_CALL_NO_WG, i64, env, tl, i64, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be), \ TCG_CALL_NO_WG, i64, env, tl, i64, i32) +#else +#define GEN_ATOMIC_HELPERS(NAME) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32, i32) +#endif /* CONFIG_ATOMIC64 */ #else DEF_HELPER_FLAGS_4(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env, tl, i32, i32) DEF_HELPER_FLAGS_4(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32) -DEF_HELPER_FLAGS_4(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32) -DEF_HELPER_FLAGS_4(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env, tl, i64, i64) DEF_HELPER_FLAGS_4(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +DEF_HELPER_FLAGS_4(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env, tl, i32, i32) DEF_HELPER_FLAGS_4(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env, tl, i32, i32) +#ifdef CONFIG_ATOMIC64 +DEF_HELPER_FLAGS_4(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env, tl, i64, i64) DEF_HELPER_FLAGS_4(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env, tl, i64, i64) +#endif +#ifdef CONFIG_ATOMIC64 #define GEN_ATOMIC_HELPERS(NAME) \ DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), b), \ TCG_CALL_NO_WG, i32, env, tl, i32) \ @@ -73,6 +94,19 @@ DEF_HELPER_FLAGS_4(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env, tl, i64, i64) TCG_CALL_NO_WG, i64, env, tl, i64) \ DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), q_be), \ TCG_CALL_NO_WG, i64, env, tl, i64) +#else +#define GEN_ATOMIC_HELPERS(NAME) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), b), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), w_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_le), \ + TCG_CALL_NO_WG, i32, env, tl, i32) \ + DEF_HELPER_FLAGS_3(glue(glue(atomic_, NAME), l_be), \ + TCG_CALL_NO_WG, i32, env, tl, i32) +#endif /* CONFIG_ATOMIC64 */ #endif /* CONFIG_SOFTMMU */ diff --git a/tcg/tcg.h b/tcg/tcg.h index bc3ea7af1968..b34b5fbc2fa7 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -1204,14 +1204,23 @@ TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu \ (CPUArchState *env, target_ulong addr, TYPE val, \ TCGMemOpIdx oi, uintptr_t retaddr); +#ifdef CONFIG_ATOMIC64 #define GEN_ATOMIC_HELPER_ALL(NAME) \ - GEN_ATOMIC_HELPER(NAME, uint32_t, b) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, b) \ GEN_ATOMIC_HELPER(NAME, uint32_t, w_le) \ - GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \ - GEN_ATOMIC_HELPER(NAME, uint64_t, q_le) \ 
GEN_ATOMIC_HELPER(NAME, uint32_t, w_be) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \ GEN_ATOMIC_HELPER(NAME, uint32_t, l_be) \ + GEN_ATOMIC_HELPER(NAME, uint64_t, q_le) \ GEN_ATOMIC_HELPER(NAME, uint64_t, q_be) +#else +#define GEN_ATOMIC_HELPER_ALL(NAME) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, b) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, w_le) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, w_be) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, l_le) \ + GEN_ATOMIC_HELPER(NAME, uint32_t, l_be) +#endif GEN_ATOMIC_HELPER_ALL(fetch_add) GEN_ATOMIC_HELPER_ALL(fetch_sub) From 91682118aa330aff7e8ef0cc685c32d101f49940 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 16 Sep 2016 09:24:20 -0700 Subject: [PATCH 18/37] tcg: Emit barriers with parallel_cpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Emilio G. Cota Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- tcg/tcg-op.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index cdd61d678bcd..bb2bfeef3c33 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -150,17 +150,7 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2, void tcg_gen_mb(TCGBar mb_type) { - bool emit_barriers = true; - -#ifndef CONFIG_USER_ONLY - /* TODO: When MTTCG is available for system mode, we will check - * the following condition and enable emit_barriers - * (qemu_tcg_mttcg_enabled() && smp_cpus > 1) - */ - emit_barriers = false; -#endif - - if (emit_barriers) { + if (parallel_cpus) { tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); } } From ae03f8de45427042ecd10b0941a005f21ecc064c Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:01:51 -0400 Subject: [PATCH 19/37] target-i386: emulate LOCK'ed cmpxchg using cmpxchg helpers The diff here is uglier than necessary. All this does is to turn FOO into: if (s->prefix & PREFIX_LOCK) { BAR } else { FOO } where FOO is the original implementation of an unlocked cmpxchg. [rth: Adjust unlocked cmpxchg to use movcond instead of branches. Adjust helpers to use atomic helpers.] Signed-off-by: Emilio G. 
Cota Message-Id: <1467054136-10430-6-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/helper.h | 2 + target-i386/mem_helper.c | 134 +++++++++++++++++++++++++++++++++------ target-i386/translate.c | 99 ++++++++++++++++------------- 3 files changed, 169 insertions(+), 66 deletions(-) diff --git a/target-i386/helper.h b/target-i386/helper.h index 1320edc0164a..729d4b63da57 100644 --- a/target-i386/helper.h +++ b/target-i386/helper.h @@ -74,8 +74,10 @@ DEF_HELPER_3(boundw, void, env, tl, int) DEF_HELPER_3(boundl, void, env, tl, int) DEF_HELPER_1(rsm, void, env) DEF_HELPER_2(into, void, env, int) +DEF_HELPER_2(cmpxchg8b_unlocked, void, env, tl) DEF_HELPER_2(cmpxchg8b, void, env, tl) #ifdef TARGET_X86_64 +DEF_HELPER_2(cmpxchg16b_unlocked, void, env, tl) DEF_HELPER_2(cmpxchg16b, void, env, tl) #endif DEF_HELPER_1(single_step, void, env) diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c index 5bc0594dfae3..c4b5c5bd94c7 100644 --- a/target-i386/mem_helper.c +++ b/target-i386/mem_helper.c @@ -22,6 +22,8 @@ #include "exec/helper-proto.h" #include "exec/exec-all.h" #include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "tcg.h" /* broken thread support */ @@ -56,53 +58,143 @@ void helper_lock_init(void) } #endif +void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0) +{ + uintptr_t ra = GETPC(); + uint64_t oldv, cmpv, newv; + int eflags; + + eflags = cpu_cc_compute_all(env, CC_OP); + + cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); + newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); + + oldv = cpu_ldq_data_ra(env, a0, ra); + newv = (cmpv == oldv ? newv : oldv); + /* always do the store */ + cpu_stq_data_ra(env, a0, newv, ra); + + if (oldv == cmpv) { + eflags |= CC_Z; + } else { + env->regs[R_EAX] = (uint32_t)oldv; + env->regs[R_EDX] = (uint32_t)(oldv >> 32); + eflags &= ~CC_Z; + } + CC_SRC = eflags; +} + void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) { - uint64_t d; +#ifdef CONFIG_ATOMIC64 + uint64_t oldv, cmpv, newv; int eflags; eflags = cpu_cc_compute_all(env, CC_OP); - d = cpu_ldq_data_ra(env, a0, GETPC()); - if (d == (((uint64_t)env->regs[R_EDX] << 32) | (uint32_t)env->regs[R_EAX])) { - cpu_stq_data_ra(env, a0, ((uint64_t)env->regs[R_ECX] << 32) - | (uint32_t)env->regs[R_EBX], GETPC()); + + cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); + newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); + +#ifdef CONFIG_USER_ONLY + { + uint64_t *haddr = g2h(a0); + cmpv = cpu_to_le64(cmpv); + newv = cpu_to_le64(newv); + oldv = atomic_cmpxchg__nocheck(haddr, cmpv, newv); + oldv = le64_to_cpu(oldv); + } +#else + { + uintptr_t ra = GETPC(); + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx); + oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra); + } +#endif + + if (oldv == cmpv) { eflags |= CC_Z; } else { - /* always do the store */ - cpu_stq_data_ra(env, a0, d, GETPC()); - env->regs[R_EDX] = (uint32_t)(d >> 32); - env->regs[R_EAX] = (uint32_t)d; + env->regs[R_EAX] = (uint32_t)oldv; + env->regs[R_EDX] = (uint32_t)(oldv >> 32); eflags &= ~CC_Z; } CC_SRC = eflags; +#else + cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC()); +#endif /* CONFIG_ATOMIC64 */ } #ifdef TARGET_X86_64 -void helper_cmpxchg16b(CPUX86State *env, target_ulong a0) +void helper_cmpxchg16b_unlocked(CPUX86State *env, target_ulong a0) { - uint64_t d0, d1; + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; + uint64_t o0, o1; int eflags; + bool success; if ((a0 & 0xf) != 0) 
{ raise_exception_ra(env, EXCP0D_GPF, GETPC()); } eflags = cpu_cc_compute_all(env, CC_OP); - d0 = cpu_ldq_data_ra(env, a0, GETPC()); - d1 = cpu_ldq_data_ra(env, a0 + 8, GETPC()); - if (d0 == env->regs[R_EAX] && d1 == env->regs[R_EDX]) { - cpu_stq_data_ra(env, a0, env->regs[R_EBX], GETPC()); - cpu_stq_data_ra(env, a0 + 8, env->regs[R_ECX], GETPC()); + + cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]); + newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]); + + o0 = cpu_ldq_data_ra(env, a0 + 0, ra); + o1 = cpu_ldq_data_ra(env, a0 + 8, ra); + + oldv = int128_make128(o0, o1); + success = int128_eq(oldv, cmpv); + if (!success) { + newv = oldv; + } + + cpu_stq_data_ra(env, a0 + 0, int128_getlo(newv), ra); + cpu_stq_data_ra(env, a0 + 8, int128_gethi(newv), ra); + + if (success) { eflags |= CC_Z; } else { - /* always do the store */ - cpu_stq_data_ra(env, a0, d0, GETPC()); - cpu_stq_data_ra(env, a0 + 8, d1, GETPC()); - env->regs[R_EDX] = d1; - env->regs[R_EAX] = d0; + env->regs[R_EAX] = int128_getlo(oldv); + env->regs[R_EDX] = int128_gethi(oldv); eflags &= ~CC_Z; } CC_SRC = eflags; } + +void helper_cmpxchg16b(CPUX86State *env, target_ulong a0) +{ + uintptr_t ra = GETPC(); + + if ((a0 & 0xf) != 0) { + raise_exception_ra(env, EXCP0D_GPF, ra); + } else { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#else + int eflags = cpu_cc_compute_all(env, CC_OP); + + Int128 cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]); + Int128 newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]); + + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx); + Int128 oldv = helper_atomic_cmpxchgo_le_mmu(env, a0, cmpv, + newv, oi, ra); + + if (int128_eq(oldv, cmpv)) { + eflags |= CC_Z; + } else { + env->regs[R_EAX] = int128_getlo(oldv); + env->regs[R_EDX] = int128_gethi(oldv); + eflags &= ~CC_Z; + } + CC_SRC = eflags; +#endif + } +} #endif void helper_boundw(CPUX86State *env, target_ulong a0, int v) diff --git a/target-i386/translate.c b/target-i386/translate.c index 23fde58547ab..dd1b408642e9 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -5069,57 +5069,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, case 0x1b0: case 0x1b1: /* cmpxchg Ev, Gv */ { - TCGLabel *label1, *label2; - TCGv t0, t1, t2, a0; + TCGv oldv, newv, cmpv; ot = mo_b_d(b, dflag); modrm = cpu_ldub_code(env, s->pc++); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; - t0 = tcg_temp_local_new(); - t1 = tcg_temp_local_new(); - t2 = tcg_temp_local_new(); - a0 = tcg_temp_local_new(); - gen_op_mov_v_reg(ot, t1, reg); - if (mod == 3) { - rm = (modrm & 7) | REX_B(s); - gen_op_mov_v_reg(ot, t0, rm); - } else { + oldv = tcg_temp_new(); + newv = tcg_temp_new(); + cmpv = tcg_temp_new(); + gen_op_mov_v_reg(ot, newv, reg); + tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]); + + if (s->prefix & PREFIX_LOCK) { + if (mod == 3) { + goto illegal_op; + } gen_lea_modrm(env, s, modrm); - tcg_gen_mov_tl(a0, cpu_A0); - gen_op_ld_v(s, ot, t0, a0); - rm = 0; /* avoid warning */ - } - label1 = gen_new_label(); - tcg_gen_mov_tl(t2, cpu_regs[R_EAX]); - gen_extu(ot, t0); - gen_extu(ot, t2); - tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1); - label2 = gen_new_label(); - if (mod == 3) { - gen_op_mov_reg_v(ot, R_EAX, t0); - tcg_gen_br(label2); - gen_set_label(label1); - gen_op_mov_reg_v(ot, rm, t1); + tcg_gen_atomic_cmpxchg_tl(oldv, cpu_A0, cmpv, newv, + s->mem_index, ot | MO_LE); + gen_op_mov_reg_v(ot, R_EAX, oldv); } else { - /* perform no-op 
store cycle like physical cpu; must be - before changing accumulator to ensure idempotency if - the store faults and the instruction is restarted */ - gen_op_st_v(s, ot, t0, a0); - gen_op_mov_reg_v(ot, R_EAX, t0); - tcg_gen_br(label2); - gen_set_label(label1); - gen_op_st_v(s, ot, t1, a0); - } - gen_set_label(label2); - tcg_gen_mov_tl(cpu_cc_src, t0); - tcg_gen_mov_tl(cpu_cc_srcT, t2); - tcg_gen_sub_tl(cpu_cc_dst, t2, t0); + if (mod == 3) { + rm = (modrm & 7) | REX_B(s); + gen_op_mov_v_reg(ot, oldv, rm); + } else { + gen_lea_modrm(env, s, modrm); + gen_op_ld_v(s, ot, oldv, cpu_A0); + rm = 0; /* avoid warning */ + } + gen_extu(ot, oldv); + gen_extu(ot, cmpv); + /* store value = (old == cmp ? new : old); */ + tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); + if (mod == 3) { + gen_op_mov_reg_v(ot, R_EAX, oldv); + gen_op_mov_reg_v(ot, rm, newv); + } else { + /* Perform an unconditional store cycle like physical cpu; + must be before changing accumulator to ensure + idempotency if the store faults and the instruction + is restarted */ + gen_op_st_v(s, ot, newv, cpu_A0); + gen_op_mov_reg_v(ot, R_EAX, oldv); + } + } + tcg_gen_mov_tl(cpu_cc_src, oldv); + tcg_gen_mov_tl(cpu_cc_srcT, cmpv); + tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv); set_cc_op(s, CC_OP_SUBB + ot); - tcg_temp_free(t0); - tcg_temp_free(t1); - tcg_temp_free(t2); - tcg_temp_free(a0); + tcg_temp_free(oldv); + tcg_temp_free(newv); + tcg_temp_free(cmpv); } break; case 0x1c7: /* cmpxchg8b */ @@ -5132,14 +5133,22 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, if (!(s->cpuid_ext_features & CPUID_EXT_CX16)) goto illegal_op; gen_lea_modrm(env, s, modrm); - gen_helper_cmpxchg16b(cpu_env, cpu_A0); + if ((s->prefix & PREFIX_LOCK) && parallel_cpus) { + gen_helper_cmpxchg16b(cpu_env, cpu_A0); + } else { + gen_helper_cmpxchg16b_unlocked(cpu_env, cpu_A0); + } } else #endif { if (!(s->cpuid_features & CPUID_CX8)) goto illegal_op; gen_lea_modrm(env, s, modrm); - gen_helper_cmpxchg8b(cpu_env, cpu_A0); + if ((s->prefix & PREFIX_LOCK) && parallel_cpus) { + gen_helper_cmpxchg8b(cpu_env, cpu_A0); + } else { + gen_helper_cmpxchg8b_unlocked(cpu_env, cpu_A0); + } } set_cc_op(s, CC_OP_EFLAGS); break; From a7cee522f3529c2fc85379237b391ea98823271e Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:01:58 -0400 Subject: [PATCH 20/37] target-i386: emulate LOCK'ed OP instructions using atomic helpers [rth: Eliminate some unnecessary temporaries.] Signed-off-by: Emilio G. 
Cota Message-Id: <1467054136-10430-13-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 76 +++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index dd1b408642e9..b9f58f8e3269 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -1257,55 +1257,95 @@ static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d) { if (d != OR_TMP0) { gen_op_mov_v_reg(ot, cpu_T0, d); - } else { + } else if (!(s1->prefix & PREFIX_LOCK)) { gen_op_ld_v(s1, ot, cpu_T0, cpu_A0); } switch(op) { case OP_ADCL: gen_compute_eflags_c(s1, cpu_tmp4); - tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); - tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_tmp4); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_add_tl(cpu_T0, cpu_tmp4, cpu_T1); + tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0, + s1->mem_index, ot | MO_LE); + } else { + tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); + tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_tmp4); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update3_cc(cpu_tmp4); set_cc_op(s1, CC_OP_ADCB + ot); break; case OP_SBBL: gen_compute_eflags_c(s1, cpu_tmp4); - tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1); - tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_tmp4); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_add_tl(cpu_T0, cpu_T1, cpu_tmp4); + tcg_gen_neg_tl(cpu_T0, cpu_T0); + tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0, + s1->mem_index, ot | MO_LE); + } else { + tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1); + tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_tmp4); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update3_cc(cpu_tmp4); set_cc_op(s1, CC_OP_SBBB + ot); break; case OP_ADDL: - tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T1, + s1->mem_index, ot | MO_LE); + } else { + tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update2_cc(); set_cc_op(s1, CC_OP_ADDB + ot); break; case OP_SUBL: - tcg_gen_mov_tl(cpu_cc_srcT, cpu_T0); - tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_neg_tl(cpu_T0, cpu_T1); + tcg_gen_atomic_fetch_add_tl(cpu_cc_srcT, cpu_A0, cpu_T0, + s1->mem_index, ot | MO_LE); + tcg_gen_sub_tl(cpu_T0, cpu_cc_srcT, cpu_T1); + } else { + tcg_gen_mov_tl(cpu_cc_srcT, cpu_T0); + tcg_gen_sub_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update2_cc(); set_cc_op(s1, CC_OP_SUBB + ot); break; default: case OP_ANDL: - tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_atomic_and_fetch_tl(cpu_T0, cpu_A0, cpu_T1, + s1->mem_index, ot | MO_LE); + } else { + tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update1_cc(); set_cc_op(s1, CC_OP_LOGICB + ot); break; case OP_ORL: - tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_atomic_or_fetch_tl(cpu_T0, cpu_A0, cpu_T1, + s1->mem_index, ot | MO_LE); + } else { + tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update1_cc(); set_cc_op(s1, CC_OP_LOGICB + ot); break; case OP_XORL: - tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_rm_T0_A0(s1, ot, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_atomic_xor_fetch_tl(cpu_T0, cpu_A0, cpu_T1, + s1->mem_index, ot | MO_LE); + } 
else { + tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_rm_T0_A0(s1, ot, d); + } gen_op_update1_cc(); set_cc_op(s1, CC_OP_LOGICB + ot); break; From 60e573462fcdb83aa1a41e66a9f31dc8a4364399 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:01:59 -0400 Subject: [PATCH 21/37] target-i386: emulate LOCK'ed INC using atomic helper [rth: Merge gen_inc_locked back into gen_inc to share cc update.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-14-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index b9f58f8e3269..2f39dac49f9a 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -1361,21 +1361,23 @@ static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d) /* if d == OR_TMP0, it means memory operand (address in A0) */ static void gen_inc(DisasContext *s1, TCGMemOp ot, int d, int c) { - if (d != OR_TMP0) { - gen_op_mov_v_reg(ot, cpu_T0, d); + if (s1->prefix & PREFIX_LOCK) { + tcg_gen_movi_tl(cpu_T0, c > 0 ? 1 : -1); + tcg_gen_atomic_add_fetch_tl(cpu_T0, cpu_A0, cpu_T0, + s1->mem_index, ot | MO_LE); } else { - gen_op_ld_v(s1, ot, cpu_T0, cpu_A0); + if (d != OR_TMP0) { + gen_op_mov_v_reg(ot, cpu_T0, d); + } else { + gen_op_ld_v(s1, ot, cpu_T0, cpu_A0); + } + tcg_gen_addi_tl(cpu_T0, cpu_T0, (c > 0 ? 1 : -1)); + gen_op_st_rm_T0_A0(s1, ot, d); } + gen_compute_eflags_c(s1, cpu_cc_src); - if (c > 0) { - tcg_gen_addi_tl(cpu_T0, cpu_T0, 1); - set_cc_op(s1, CC_OP_INCB + ot); - } else { - tcg_gen_addi_tl(cpu_T0, cpu_T0, -1); - set_cc_op(s1, CC_OP_DECB + ot); - } - gen_op_st_rm_T0_A0(s1, ot, d); tcg_gen_mov_tl(cpu_cc_dst, cpu_T0); + set_cc_op(s1, (c > 0 ? CC_OP_INCB : CC_OP_DECB) + ot); } static void gen_shift_flags(DisasContext *s, TCGMemOp ot, TCGv result, From 2a5fe8ae145ef7a3ab480922116d27efcc97b85d Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:00 -0400 Subject: [PATCH 22/37] target-i386: emulate LOCK'ed NOT using atomic helper [rth: Avoid qemu_load that's redundant with the atomic op.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-15-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index 2f39dac49f9a..6d71564317ff 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -4674,10 +4674,15 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, rm = (modrm & 7) | REX_B(s); op = (modrm >> 3) & 7; if (mod != 3) { - if (op == 0) + if (op == 0) { s->rip_offset = insn_const_size(ot); + } gen_lea_modrm(env, s, modrm); - gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + /* For those below that handle locked memory, don't load here. 
*/ + if (!(s->prefix & PREFIX_LOCK) + || op != 2) { + gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + } } else { gen_op_mov_v_reg(ot, cpu_T0, rm); } @@ -4690,11 +4695,20 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, set_cc_op(s, CC_OP_LOGICB + ot); break; case 2: /* not */ - tcg_gen_not_tl(cpu_T0, cpu_T0); - if (mod != 3) { - gen_op_st_v(s, ot, cpu_T0, cpu_A0); + if (s->prefix & PREFIX_LOCK) { + if (mod == 3) { + goto illegal_op; + } + tcg_gen_movi_tl(cpu_T0, ~0); + tcg_gen_atomic_xor_fetch_tl(cpu_T0, cpu_A0, cpu_T0, + s->mem_index, ot | MO_LE); } else { - gen_op_mov_reg_v(ot, rm, cpu_T0); + tcg_gen_not_tl(cpu_T0, cpu_T0); + if (mod != 3) { + gen_op_st_v(s, ot, cpu_T0, cpu_A0); + } else { + gen_op_mov_reg_v(ot, rm, cpu_T0); + } } break; case 3: /* neg */ From 8eb8c7385608b99bed6055a22d897ff727a6cb8e Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:01 -0400 Subject: [PATCH 23/37] target-i386: emulate LOCK'ed NEG using cmpxchg helper [rth: Move redundant qemu_load out of cmpxchg loop.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-16-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index 6d71564317ff..d2edbd9b29b1 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -4712,11 +4712,41 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; case 3: /* neg */ - tcg_gen_neg_tl(cpu_T0, cpu_T0); - if (mod != 3) { - gen_op_st_v(s, ot, cpu_T0, cpu_A0); + if (s->prefix & PREFIX_LOCK) { + TCGLabel *label1; + TCGv a0, t0, t1, t2; + + if (mod == 3) { + goto illegal_op; + } + a0 = tcg_temp_local_new(); + t0 = tcg_temp_local_new(); + label1 = gen_new_label(); + + tcg_gen_mov_tl(a0, cpu_A0); + tcg_gen_mov_tl(t0, cpu_T0); + + gen_set_label(label1); + t1 = tcg_temp_new(); + t2 = tcg_temp_new(); + tcg_gen_mov_tl(t2, t0); + tcg_gen_neg_tl(t1, t0); + tcg_gen_atomic_cmpxchg_tl(t0, a0, t0, t1, + s->mem_index, ot | MO_LE); + tcg_temp_free(t1); + tcg_gen_brcond_tl(TCG_COND_NE, t0, t2, label1); + + tcg_temp_free(t2); + tcg_temp_free(a0); + tcg_gen_mov_tl(cpu_T0, t0); + tcg_temp_free(t0); } else { - gen_op_mov_reg_v(ot, rm, cpu_T0); + tcg_gen_neg_tl(cpu_T0, cpu_T0); + if (mod != 3) { + gen_op_st_v(s, ot, cpu_T0, cpu_A0); + } else { + gen_op_mov_reg_v(ot, rm, cpu_T0); + } } gen_op_update_neg_cc(); set_cc_op(s, CC_OP_SUBB + ot); From f53b01817f95781d2bcc8a82e057d1416601e13b Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:02 -0400 Subject: [PATCH 24/37] target-i386: emulate LOCK'ed XADD using atomic helper [rth: Move load of reg value to common location.] Signed-off-by: Emilio G. 
Cota Message-Id: <1467054136-10430-17-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index d2edbd9b29b1..3e6011b953ba 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -5134,19 +5134,24 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, modrm = cpu_ldub_code(env, s->pc++); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; + gen_op_mov_v_reg(ot, cpu_T0, reg); if (mod == 3) { rm = (modrm & 7) | REX_B(s); - gen_op_mov_v_reg(ot, cpu_T0, reg); gen_op_mov_v_reg(ot, cpu_T1, rm); tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); gen_op_mov_reg_v(ot, reg, cpu_T1); gen_op_mov_reg_v(ot, rm, cpu_T0); } else { gen_lea_modrm(env, s, modrm); - gen_op_mov_v_reg(ot, cpu_T0, reg); - gen_op_ld_v(s, ot, cpu_T1, cpu_A0); - tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); - gen_op_st_v(s, ot, cpu_T0, cpu_A0); + if (s->prefix & PREFIX_LOCK) { + tcg_gen_atomic_fetch_add_tl(cpu_T1, cpu_A0, cpu_T0, + s->mem_index, ot | MO_LE); + tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); + } else { + gen_op_ld_v(s, ot, cpu_T1, cpu_A0); + tcg_gen_add_tl(cpu_T0, cpu_T0, cpu_T1); + gen_op_st_v(s, ot, cpu_T0, cpu_A0); + } gen_op_mov_reg_v(ot, reg, cpu_T1); } gen_op_update2_cc(); From cfe819d309d472f75fd129faf1d1064a2498326c Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:03 -0400 Subject: [PATCH 25/37] target-i386: emulate LOCK'ed BTX ops using atomic helpers [rth: Avoid redundant qemu_ld in locked case. Fix previously unnoticed incorrect zero-extension of address in register-offset case.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-18-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 87 +++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index 3e6011b953ba..b3cb4c5f640a 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -6655,7 +6655,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, if (mod != 3) { s->rip_offset = 1; gen_lea_modrm(env, s, modrm); - gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + if (!(s->prefix & PREFIX_LOCK)) { + gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + } } else { gen_op_mov_v_reg(ot, cpu_T0, rm); } @@ -6685,44 +6687,69 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, rm = (modrm & 7) | REX_B(s); gen_op_mov_v_reg(MO_32, cpu_T1, reg); if (mod != 3) { - gen_lea_modrm(env, s, modrm); + AddressParts a = gen_lea_modrm_0(env, s, modrm); /* specific case: we need to add a displacement */ gen_exts(ot, cpu_T1); tcg_gen_sari_tl(cpu_tmp0, cpu_T1, 3 + ot); tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, ot); - tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0); - gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + tcg_gen_add_tl(cpu_A0, gen_lea_modrm_1(a), cpu_tmp0); + gen_lea_v_seg(s, s->aflag, cpu_A0, a.def_seg, s->override); + if (!(s->prefix & PREFIX_LOCK)) { + gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + } } else { gen_op_mov_v_reg(ot, cpu_T0, rm); } bt_op: tcg_gen_andi_tl(cpu_T1, cpu_T1, (1 << (3 + ot)) - 1); - tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1); - switch(op) { - case 0: - break; - case 1: - tcg_gen_movi_tl(cpu_tmp0, 1); - tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1); - tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_tmp0); - break; - case 2: - tcg_gen_movi_tl(cpu_tmp0, 1); - tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1); - 
tcg_gen_andc_tl(cpu_T0, cpu_T0, cpu_tmp0); - break; - default: - case 3: - tcg_gen_movi_tl(cpu_tmp0, 1); - tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1); - tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_tmp0); - break; - } - if (op != 0) { - if (mod != 3) { - gen_op_st_v(s, ot, cpu_T0, cpu_A0); - } else { - gen_op_mov_reg_v(ot, rm, cpu_T0); + tcg_gen_movi_tl(cpu_tmp0, 1); + tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T1); + if (s->prefix & PREFIX_LOCK) { + switch (op) { + case 0: /* bt */ + /* Needs no atomic ops; we surpressed the normal + memory load for LOCK above so do it now. */ + gen_op_ld_v(s, ot, cpu_T0, cpu_A0); + break; + case 1: /* bts */ + tcg_gen_atomic_fetch_or_tl(cpu_T0, cpu_A0, cpu_tmp0, + s->mem_index, ot | MO_LE); + break; + case 2: /* btr */ + tcg_gen_not_tl(cpu_tmp0, cpu_tmp0); + tcg_gen_atomic_fetch_and_tl(cpu_T0, cpu_A0, cpu_tmp0, + s->mem_index, ot | MO_LE); + break; + default: + case 3: /* btc */ + tcg_gen_atomic_fetch_xor_tl(cpu_T0, cpu_A0, cpu_tmp0, + s->mem_index, ot | MO_LE); + break; + } + tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1); + } else { + tcg_gen_shr_tl(cpu_tmp4, cpu_T0, cpu_T1); + switch (op) { + case 0: /* bt */ + /* Data already loaded; nothing to do. */ + break; + case 1: /* bts */ + tcg_gen_or_tl(cpu_T0, cpu_T0, cpu_tmp0); + break; + case 2: /* btr */ + tcg_gen_andc_tl(cpu_T0, cpu_T0, cpu_tmp0); + break; + default: + case 3: /* btc */ + tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_tmp0); + break; + } + if (op != 0) { + if (mod != 3) { + gen_op_st_v(s, ot, cpu_T0, cpu_A0); + } else { + gen_op_mov_reg_v(ot, rm, cpu_T0); + } } } From ea97ebe89f7a879ea9aba90140e40c29b5cbd653 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:04 -0400 Subject: [PATCH 26/37] target-i386: emulate XCHG using atomic helper Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-19-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/translate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index b3cb4c5f640a..cfa395654322 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -5564,12 +5564,8 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_lea_modrm(env, s, modrm); gen_op_mov_v_reg(ot, cpu_T0, reg); /* for xchg, lock is implicit */ - if (!(prefixes & PREFIX_LOCK)) - gen_helper_lock(); - gen_op_ld_v(s, ot, cpu_T1, cpu_A0); - gen_op_st_v(s, ot, cpu_T0, cpu_A0); - if (!(prefixes & PREFIX_LOCK)) - gen_helper_unlock(); + tcg_gen_atomic_xchg_tl(cpu_T1, cpu_A0, cpu_T0, + s->mem_index, ot | MO_LE); gen_op_mov_reg_v(ot, reg, cpu_T1); } break; From 37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:06 -0400 Subject: [PATCH 27/37] target-i386: remove helper_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's been superseded by the atomic helpers. The use of the atomic helpers provides a significant performance and scalability improvement. Below is the result of running the atomic_add-test microbenchmark with: $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n , where $n is the number of threads and $r is the allowed range for the additions. The scenarios measured are: - atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset) - cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper - master: before this patchset Results sorted in ascending range, i.e. 
descending degree of contention. Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64 Opteron 6376 cores.

[ASCII throughput plots elided: atomic_add-bench, 5000000 ops/thread, for the ranges [0,1], [0,2], [0,8], [0,128] and [0,1024], each comparing atomic, cmpxchg and master as a function of the number of threads.]

hi-res: http://imgur.com/a/fMRmq

For master I stopped measuring after 8 threads, because there is little point in measuring the well-known performance collapse of a contended lock.

Reviewed-by: Alex Bennée
Signed-off-by: Emilio G.
Cota Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-i386/helper.h | 2 -- target-i386/mem_helper.c | 33 --------------------------------- target-i386/translate.c | 15 --------------- 3 files changed, 50 deletions(-) diff --git a/target-i386/helper.h b/target-i386/helper.h index 729d4b63da57..4e859eba9d46 100644 --- a/target-i386/helper.h +++ b/target-i386/helper.h @@ -1,8 +1,6 @@ DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) -DEF_HELPER_0(lock, void) -DEF_HELPER_0(unlock, void) DEF_HELPER_3(write_eflags, void, env, tl, i32) DEF_HELPER_1(read_eflags, tl, env) DEF_HELPER_2(divb_AL, void, env, tl) diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c index c4b5c5bd94c7..70f67668ab26 100644 --- a/target-i386/mem_helper.c +++ b/target-i386/mem_helper.c @@ -25,39 +25,6 @@ #include "qemu/int128.h" #include "tcg.h" -/* broken thread support */ - -#if defined(CONFIG_USER_ONLY) -QemuMutex global_cpu_lock; - -void helper_lock(void) -{ - qemu_mutex_lock(&global_cpu_lock); -} - -void helper_unlock(void) -{ - qemu_mutex_unlock(&global_cpu_lock); -} - -void helper_lock_init(void) -{ - qemu_mutex_init(&global_cpu_lock); -} -#else -void helper_lock(void) -{ -} - -void helper_unlock(void) -{ -} - -void helper_lock_init(void) -{ -} -#endif - void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0) { uintptr_t ra = GETPC(); diff --git a/target-i386/translate.c b/target-i386/translate.c index cfa395654322..927b3665341f 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -4536,10 +4536,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, s->aflag = aflag; s->dflag = dflag; - /* lock generation */ - if (prefixes & PREFIX_LOCK) - gen_helper_lock(); - /* now check op code */ reswitch: switch(b) { @@ -8211,20 +8207,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, default: goto unknown_op; } - /* lock generation */ - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); return s->pc; illegal_op: - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); - /* XXX: ensure that no lock was generated */ gen_illegal_opcode(s); return s->pc; unknown_op: - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); - /* XXX: ensure that no lock was generated */ gen_unknown_opcode(env, s); return s->pc; } @@ -8316,8 +8303,6 @@ void tcg_x86_init(void) offsetof(CPUX86State, bnd_regs[i].ub), bnd_regu_names[i]); } - - helper_lock_init(); } /* generate intermediate code for basic block 'tb'. */ From 070e3edceaa023109bfa7a1c5c259342e0b6b625 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:05 -0400 Subject: [PATCH 28/37] tests: add atomic_add-bench With this microbenchmark we can measure the overhead of emulating atomic instructions with a configurable degree of contention. The benchmark spawns $n threads, each performing $o atomic ops (additions) in a loop. Each atomic operation is performed on a different cache line (assuming lines are 64b long) that is randomly selected from a range [0, $r). [ Note: each $foo corresponds to a -foo flag ] Signed-off-by: Emilio G. 
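A possible invocation, using the flags defined by the benchmark code below (the numbers here are only an example):

    $ tests/atomic_add-bench -n 16 -d 5 -r 1024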
Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-20-git-send-email-cota@braap.org> --- tests/.gitignore | 1 + tests/Makefile.include | 4 +- tests/atomic_add-bench.c | 163 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 tests/atomic_add-bench.c diff --git a/tests/.gitignore b/tests/.gitignore index 4aec8bc5fa9f..64e050e8590c 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,4 @@ +atomic_add-bench check-qdict check-qfloat check-qint diff --git a/tests/Makefile.include b/tests/Makefile.include index cd058efc14de..22656eaf2691 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -460,7 +460,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \ tests/test-opts-visitor.o tests/test-qmp-event.o \ tests/rcutorture.o tests/test-rcu-list.o \ tests/test-qdist.o \ - tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o + tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \ + tests/atomic_add-bench.o $(test-obj-y): QEMU_INCLUDES += -Itests QEMU_CFLAGS += -I$(SRC_PATH)/tests @@ -507,6 +508,7 @@ tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y) tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(test-util-obj-y) tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y) tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y) +tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y) tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ diff --git a/tests/atomic_add-bench.c b/tests/atomic_add-bench.c new file mode 100644 index 000000000000..caa1e8e6899b --- /dev/null +++ b/tests/atomic_add-bench.c @@ -0,0 +1,163 @@ +#include "qemu/osdep.h" +#include "qemu/thread.h" +#include "qemu/host-utils.h" +#include "qemu/processor.h" + +struct thread_info { + uint64_t r; +} QEMU_ALIGNED(64); + +struct count { + unsigned long val; +} QEMU_ALIGNED(64); + +static QemuThread *threads; +static struct thread_info *th_info; +static unsigned int n_threads = 1; +static unsigned int n_ready_threads; +static struct count *counts; +static unsigned int duration = 1; +static unsigned int range = 1024; +static bool test_start; +static bool test_stop; + +static const char commands_string[] = + " -n = number of threads\n" + " -d = duration in seconds\n" + " -r = range (will be rounded up to pow2)"; + +static void usage_complete(char *argv[]) +{ + fprintf(stderr, "Usage: %s [options]\n", argv[0]); + fprintf(stderr, "options:\n%s\n", commands_string); +} + +/* + * From: https://en.wikipedia.org/wiki/Xorshift + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only + * guaranteed to be >= INT_MAX). 
+ */ +static uint64_t xorshift64star(uint64_t x) +{ + x ^= x >> 12; /* a */ + x ^= x << 25; /* b */ + x ^= x >> 27; /* c */ + return x * UINT64_C(2685821657736338717); +} + +static void *thread_func(void *arg) +{ + struct thread_info *info = arg; + + atomic_inc(&n_ready_threads); + while (!atomic_read(&test_start)) { + cpu_relax(); + } + + while (!atomic_read(&test_stop)) { + unsigned int index; + + info->r = xorshift64star(info->r); + index = info->r & (range - 1); + atomic_inc(&counts[index].val); + } + return NULL; +} + +static void run_test(void) +{ + unsigned int remaining; + unsigned int i; + + while (atomic_read(&n_ready_threads) != n_threads) { + cpu_relax(); + } + atomic_set(&test_start, true); + do { + remaining = sleep(duration); + } while (remaining); + atomic_set(&test_stop, true); + + for (i = 0; i < n_threads; i++) { + qemu_thread_join(&threads[i]); + } +} + +static void create_threads(void) +{ + unsigned int i; + + threads = g_new(QemuThread, n_threads); + th_info = g_new(struct thread_info, n_threads); + counts = qemu_memalign(64, sizeof(*counts) * range); + memset(counts, 0, sizeof(*counts) * range); + + for (i = 0; i < n_threads; i++) { + struct thread_info *info = &th_info[i]; + + info->r = (i + 1) ^ time(NULL); + qemu_thread_create(&threads[i], NULL, thread_func, info, + QEMU_THREAD_JOINABLE); + } +} + +static void pr_params(void) +{ + printf("Parameters:\n"); + printf(" # of threads: %u\n", n_threads); + printf(" duration: %u\n", duration); + printf(" ops' range: %u\n", range); +} + +static void pr_stats(void) +{ + unsigned long long val = 0; + unsigned int i; + double tx; + + for (i = 0; i < range; i++) { + val += counts[i].val; + } + tx = val / duration / 1e6; + + printf("Results:\n"); + printf("Duration: %u s\n", duration); + printf(" Throughput: %.2f Mops/s\n", tx); + printf(" Throughput/thread: %.2f Mops/s/thread\n", tx / n_threads); +} + +static void parse_args(int argc, char *argv[]) +{ + int c; + + for (;;) { + c = getopt(argc, argv, "hd:n:r:"); + if (c < 0) { + break; + } + switch (c) { + case 'h': + usage_complete(argv); + exit(0); + case 'd': + duration = atoi(optarg); + break; + case 'n': + n_threads = atoi(optarg); + break; + case 'r': + range = pow2ceil(atoi(optarg)); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + parse_args(argc, argv); + pr_params(); + create_threads(); + run_test(); + pr_stats(); + return 0; +} From 7f5616f53896a4e08ad37de3ac50d3a4cc8eff7a Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 30 Jun 2016 11:44:14 -0700 Subject: [PATCH 29/37] target-arm: Rearrange aa32 load and store functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop specializing on TARGET_LONG_BITS == 32; unconditionally allocate a temp and expand with tcg_gen_extu_i32_tl. Split out gen_aa32_addr, gen_aa32_frob64, gen_aa32_ld_i32 and gen_aa32_st_i32 as separate interfaces. Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- target-arm/translate.c | 171 ++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 105 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index ef62f8b0c430..ee2bf57f90f1 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -932,145 +932,106 @@ static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var) * These functions work like tcg_gen_qemu_{ld,st}* except * that the address argument is TCGv_i32 rather than TCGv. 
*/ -#if TARGET_LONG_BITS == 32 -#define DO_GEN_LD(SUFF, OPC, BE32_XOR) \ -static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 addr, int index) \ -{ \ - TCGMemOp opc = (OPC) | s->be_data; \ - /* Not needed for user-mode BE32, where we use MO_BE instead. */ \ - if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \ - TCGv addr_be = tcg_temp_new(); \ - tcg_gen_xori_i32(addr_be, addr, BE32_XOR); \ - tcg_gen_qemu_ld_i32(val, addr_be, index, opc); \ - tcg_temp_free(addr_be); \ - return; \ - } \ - tcg_gen_qemu_ld_i32(val, addr, index, opc); \ -} - -#define DO_GEN_ST(SUFF, OPC, BE32_XOR) \ -static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 addr, int index) \ -{ \ - TCGMemOp opc = (OPC) | s->be_data; \ - /* Not needed for user-mode BE32, where we use MO_BE instead. */ \ - if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \ - TCGv addr_be = tcg_temp_new(); \ - tcg_gen_xori_i32(addr_be, addr, BE32_XOR); \ - tcg_gen_qemu_st_i32(val, addr_be, index, opc); \ - tcg_temp_free(addr_be); \ - return; \ - } \ - tcg_gen_qemu_st_i32(val, addr, index, opc); \ -} - -static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val, - TCGv_i32 addr, int index) +static inline TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, TCGMemOp op) { - TCGMemOp opc = MO_Q | s->be_data; - tcg_gen_qemu_ld_i64(val, addr, index, opc); + TCGv addr = tcg_temp_new(); + tcg_gen_extu_i32_tl(addr, a32); + /* Not needed for user-mode BE32, where we use MO_BE instead. */ - if (!IS_USER_ONLY && s->sctlr_b) { - tcg_gen_rotri_i64(val, val, 32); + if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) { + tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE))); } + return addr; } -static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val, - TCGv_i32 addr, int index) +static void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, TCGMemOp opc) { - TCGMemOp opc = MO_Q | s->be_data; - /* Not needed for user-mode BE32, where we use MO_BE instead. */ - if (!IS_USER_ONLY && s->sctlr_b) { - TCGv_i64 tmp = tcg_temp_new_i64(); - tcg_gen_rotri_i64(tmp, val, 32); - tcg_gen_qemu_st_i64(tmp, addr, index, opc); - tcg_temp_free_i64(tmp); - return; - } - tcg_gen_qemu_st_i64(val, addr, index, opc); + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_ld_i32(val, addr, index, opc); + tcg_temp_free(addr); } -#else +static void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, TCGMemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_st_i32(val, addr, index, opc); + tcg_temp_free(addr); +} -#define DO_GEN_LD(SUFF, OPC, BE32_XOR) \ +#define DO_GEN_LD(SUFF, OPC) \ static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 addr, int index) \ + TCGv_i32 a32, int index) \ { \ - TCGMemOp opc = (OPC) | s->be_data; \ - TCGv addr64 = tcg_temp_new(); \ - tcg_gen_extu_i32_i64(addr64, addr); \ - /* Not needed for user-mode BE32, where we use MO_BE instead. 
*/ \ - if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \ - tcg_gen_xori_i64(addr64, addr64, BE32_XOR); \ - } \ - tcg_gen_qemu_ld_i32(val, addr64, index, opc); \ - tcg_temp_free(addr64); \ -} - -#define DO_GEN_ST(SUFF, OPC, BE32_XOR) \ + gen_aa32_ld_i32(s, val, a32, index, OPC | s->be_data); \ +} + +#define DO_GEN_ST(SUFF, OPC) \ static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 addr, int index) \ + TCGv_i32 a32, int index) \ { \ - TCGMemOp opc = (OPC) | s->be_data; \ - TCGv addr64 = tcg_temp_new(); \ - tcg_gen_extu_i32_i64(addr64, addr); \ - /* Not needed for user-mode BE32, where we use MO_BE instead. */ \ - if (!IS_USER_ONLY && s->sctlr_b && BE32_XOR) { \ - tcg_gen_xori_i64(addr64, addr64, BE32_XOR); \ - } \ - tcg_gen_qemu_st_i32(val, addr64, index, opc); \ - tcg_temp_free(addr64); \ + gen_aa32_st_i32(s, val, a32, index, OPC | s->be_data); \ } -static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val, - TCGv_i32 addr, int index) +static inline void gen_aa32_frob64(DisasContext *s, TCGv_i64 val) { - TCGMemOp opc = MO_Q | s->be_data; - TCGv addr64 = tcg_temp_new(); - tcg_gen_extu_i32_i64(addr64, addr); - tcg_gen_qemu_ld_i64(val, addr64, index, opc); - /* Not needed for user-mode BE32, where we use MO_BE instead. */ if (!IS_USER_ONLY && s->sctlr_b) { tcg_gen_rotri_i64(val, val, 32); } - tcg_temp_free(addr64); } -static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val, - TCGv_i32 addr, int index) +static void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, TCGMemOp opc) { - TCGMemOp opc = MO_Q | s->be_data; - TCGv addr64 = tcg_temp_new(); - tcg_gen_extu_i32_i64(addr64, addr); + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_ld_i64(val, addr, index, opc); + gen_aa32_frob64(s, val); + tcg_temp_free(addr); +} + +static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index) +{ + gen_aa32_ld_i64(s, val, a32, index, MO_Q | s->be_data); +} + +static void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, TCGMemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); /* Not needed for user-mode BE32, where we use MO_BE instead. */ if (!IS_USER_ONLY && s->sctlr_b) { - TCGv tmp = tcg_temp_new(); + TCGv_i64 tmp = tcg_temp_new_i64(); tcg_gen_rotri_i64(tmp, val, 32); - tcg_gen_qemu_st_i64(tmp, addr64, index, opc); - tcg_temp_free(tmp); + tcg_gen_qemu_st_i64(tmp, addr, index, opc); + tcg_temp_free_i64(tmp); } else { - tcg_gen_qemu_st_i64(val, addr64, index, opc); + tcg_gen_qemu_st_i64(val, addr, index, opc); } - tcg_temp_free(addr64); + tcg_temp_free(addr); } -#endif +static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index) +{ + gen_aa32_st_i64(s, val, a32, index, MO_Q | s->be_data); +} -DO_GEN_LD(8s, MO_SB, 3) -DO_GEN_LD(8u, MO_UB, 3) -DO_GEN_LD(16s, MO_SW, 2) -DO_GEN_LD(16u, MO_UW, 2) -DO_GEN_LD(32u, MO_UL, 0) +DO_GEN_LD(8s, MO_SB) +DO_GEN_LD(8u, MO_UB) +DO_GEN_LD(16s, MO_SW) +DO_GEN_LD(16u, MO_UW) +DO_GEN_LD(32u, MO_UL) /* 'a' variants include an alignment check */ -DO_GEN_LD(16ua, MO_UW | MO_ALIGN, 2) -DO_GEN_LD(32ua, MO_UL | MO_ALIGN, 0) -DO_GEN_ST(8, MO_UB, 3) -DO_GEN_ST(16, MO_UW, 2) -DO_GEN_ST(32, MO_UL, 0) +DO_GEN_LD(16ua, MO_UW | MO_ALIGN) +DO_GEN_LD(32ua, MO_UL | MO_ALIGN) +DO_GEN_ST(8, MO_UB) +DO_GEN_ST(16, MO_UW) +DO_GEN_ST(32, MO_UL) static inline void gen_set_pc_im(DisasContext *s, target_ulong val) { From 354161b37c6465a32073eac5f16fa35939af2bb4 Mon Sep 17 00:00:00 2001 From: "Emilio G. 
Cota"
Date: Mon, 27 Jun 2016 15:02:08 -0400
Subject: [PATCH 30/37] target-arm: emulate LL/SC using cmpxchg helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative.

The appended emulates LL/SC pairs in ARM with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/aNQpB

[ASCII throughput plots elided: atomic_add-bench, 1000000 ops/thread, for the ranges [0,1], [0,2], [0,128] and [0,1024], each comparing cmpxchg against master as a function of the number of threads.]

[rth: Enforce alignment for ldrexd.]
Reviewed-by: Alex Bennée
Signed-off-by: Emilio G.
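As a minimal illustration of the ABA caveat above (not part of the patch; hypothetical helper names, GCC atomic builtins assumed): a store-exclusive emulated with cmpxchg succeeds whenever the current value matches the one remembered by the load-exclusive, even if another thread changed the location from A to B and back to A in between, where an architectural store-exclusive would fail.

    #include <stdint.h>
    #include <stdbool.h>

    /* Value remembered by the emulated load-exclusive (cf. cpu_exclusive_val). */
    static uint64_t exclusive_val;

    static uint64_t emu_ldrex(uint64_t *addr)
    {
        exclusive_val = __atomic_load_n(addr, __ATOMIC_SEQ_CST);
        return exclusive_val;
    }

    /* Returns true if the "exclusive" store succeeded.  Only the value is
     * compared, so an intervening A->B->A change goes undetected. */
    static bool emu_strex(uint64_t *addr, uint64_t newval)
    {
        uint64_t expected = exclusive_val;

        return __atomic_compare_exchange_n(addr, &expected, newval, false,
                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }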
Cota Message-Id: <1467054136-10430-23-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-arm/translate.c | 140 +++++++++++++---------------------------- 1 file changed, 45 insertions(+), 95 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index ee2bf57f90f1..cc97d1ed468e 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -1026,9 +1026,6 @@ DO_GEN_LD(8u, MO_UB) DO_GEN_LD(16s, MO_SW) DO_GEN_LD(16u, MO_UW) DO_GEN_LD(32u, MO_UL) -/* 'a' variants include an alignment check */ -DO_GEN_LD(16ua, MO_UW | MO_ALIGN) -DO_GEN_LD(32ua, MO_UL | MO_ALIGN) DO_GEN_ST(8, MO_UB) DO_GEN_ST(16, MO_UW) DO_GEN_ST(32, MO_UL) @@ -7720,45 +7717,30 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) /* Load/Store exclusive instructions are implemented by remembering the value/address loaded, and seeing if these are the same - when the store is performed. This should be sufficient to implement + when the store is performed. This should be sufficient to implement the architecturally mandated semantics, and avoids having to monitor - regular stores. - - In system emulation mode only one CPU will be running at once, so - this sequence is effectively atomic. In user emulation mode we - throw an exception and handle the atomic operation elsewhere. */ + regular stores. The compare vs the remembered value is done during + the cmpxchg operation, but we must compare the addresses manually. */ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, TCGv_i32 addr, int size) { TCGv_i32 tmp = tcg_temp_new_i32(); + TCGMemOp opc = size | MO_ALIGN | s->be_data; s->is_ldex = true; - switch (size) { - case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - if (size == 3) { TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i32 tmp3 = tcg_temp_new_i32(); + TCGv_i64 t64 = tcg_temp_new_i64(); - tcg_gen_addi_i32(tmp2, addr, 4); - gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s)); - tcg_temp_free_i32(tmp2); - tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3); - store_reg(s, rt2, tmp3); + gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc); + tcg_gen_mov_i64(cpu_exclusive_val, t64); + tcg_gen_extr_i64_i32(tmp, tmp2, t64); + tcg_temp_free_i64(t64); + + store_reg(s, rt2, tmp2); } else { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc); tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp); } @@ -7771,23 +7753,15 @@ static void gen_clrex(DisasContext *s) tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#ifdef CONFIG_USER_ONLY static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, TCGv_i32 addr, int size) { - tcg_gen_extu_i32_i64(cpu_exclusive_test, addr); - tcg_gen_movi_i32(cpu_exclusive_info, - size | (rd << 4) | (rt << 8) | (rt2 << 12)); - gen_exception_internal_insn(s, 4, EXCP_STREX); -} -#else -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i32 addr, int size) -{ - TCGv_i32 tmp; - TCGv_i64 val64, extaddr; + TCGv_i32 t0, t1, t2; + TCGv_i64 extaddr; + TCGv taddr; TCGLabel *done_label; TCGLabel *fail_label; + TCGMemOp opc = size | MO_ALIGN | s->be_data; /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { [addr] = {Rt}; @@ -7802,69 +7776,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); tcg_temp_free_i64(extaddr); - tmp = 
tcg_temp_new_i32(); - switch (size) { - case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - - val64 = tcg_temp_new_i64(); + taddr = gen_aa32_addr(s, addr, opc); + t0 = tcg_temp_new_i32(); + t1 = load_reg(s, rt); if (size == 3) { - TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i32 tmp3 = tcg_temp_new_i32(); - tcg_gen_addi_i32(tmp2, addr, 4); - gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s)); - tcg_temp_free_i32(tmp2); - tcg_gen_concat_i32_i64(val64, tmp, tmp3); - tcg_temp_free_i32(tmp3); - } else { - tcg_gen_extu_i32_i64(val64, tmp); - } - tcg_temp_free_i32(tmp); + TCGv_i64 o64 = tcg_temp_new_i64(); + TCGv_i64 n64 = tcg_temp_new_i64(); - tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label); - tcg_temp_free_i64(val64); + t2 = load_reg(s, rt2); + tcg_gen_concat_i32_i64(n64, t1, t2); + tcg_temp_free_i32(t2); + gen_aa32_frob64(s, n64); - tmp = load_reg(s, rt); - switch (size) { - case 0: - gen_aa32_st8(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_st16(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - tcg_temp_free_i32(tmp); - if (size == 3) { - tcg_gen_addi_i32(addr, addr, 4); - tmp = load_reg(s, rt2); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - tcg_temp_free_i32(tmp); + tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64, + get_mem_index(s), opc); + tcg_temp_free_i64(n64); + + gen_aa32_frob64(s, o64); + tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val); + tcg_gen_extrl_i64_i32(t0, o64); + + tcg_temp_free_i64(o64); + } else { + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val); + tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc); + tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2); + tcg_temp_free_i32(t2); } - tcg_gen_movi_i32(cpu_R[rd], 0); + tcg_temp_free_i32(t1); + tcg_temp_free(taddr); + tcg_gen_mov_i32(cpu_R[rd], t0); + tcg_temp_free_i32(t0); tcg_gen_br(done_label); + gen_set_label(fail_label); tcg_gen_movi_i32(cpu_R[rd], 1); gen_set_label(done_label); tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#endif /* gen_srs: * @env: CPUARMState From cf12bce088f22b92bf62ffa0d7f6a3e951e355a9 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:10 -0400 Subject: [PATCH 31/37] target-arm: emulate SWP with atomic_xchg helper Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-25-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-arm/translate.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index cc97d1ed468e..7c32c9462aef 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -8789,25 +8789,27 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn) } tcg_temp_free_i32(addr); } else { + TCGv taddr; + TCGMemOp opc = s->be_data; + /* SWP instruction */ rm = (insn) & 0xf; - /* ??? This is not really atomic. However we know - we never have multiple CPUs running in parallel, - so it is good enough. 
*/ - addr = load_reg(s, rn); - tmp = load_reg(s, rm); - tmp2 = tcg_temp_new_i32(); if (insn & (1 << 22)) { - gen_aa32_ld8u(s, tmp2, addr, get_mem_index(s)); - gen_aa32_st8(s, tmp, addr, get_mem_index(s)); } else { - gen_aa32_ld32u(s, tmp2, addr, get_mem_index(s)); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); } - tcg_temp_free_i32(tmp); + + addr = load_reg(s, rn); + taddr = gen_aa32_addr(s, addr, opc); tcg_temp_free_i32(addr); - store_reg(s, rd, tmp2); + + tmp = load_reg(s, rm); + tcg_gen_atomic_xchg_i32(tmp, taddr, tmp, + get_mem_index(s), opc); + tcg_temp_free(taddr); + store_reg(s, rd, tmp); } } } else {

From 1dd089d0eec060dcd8478735114d98421d414805 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota"
Date: Mon, 27 Jun 2016 15:02:13 -0400
Subject: [PATCH 32/37] target-arm: emulate aarch64's LL/SC using cmpxchg helpers

Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative.

The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/JVc8Y

[ASCII throughput plots elided: atomic_add-bench, 1000000 ops/thread, for the ranges [0,1], [0,2], [0,128] and [0,1024], each comparing cmpxchg against master as a function of the number of threads.]

[rth: Rearrange 128-bit cmpxchg helper.
Enforce alignment on LL.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-arm/helper-a64.c | 113 +++++++++++++++++++++++++++++++++++++ target-arm/helper-a64.h | 2 + target-arm/translate-a64.c | 106 ++++++++++++++++------------------ 3 files changed, 163 insertions(+), 58 deletions(-) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 41e48a41b471..98b97df4612b 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -27,6 +27,10 @@ #include "qemu/bitops.h" #include "internals.h" #include "qemu/crc32c.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "tcg.h" #include /* For crc32 */ /* C2.4.7 Multiply and divide */ @@ -444,3 +448,112 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) /* Linux crc32c converts the output to one's complement. */ return crc32c(acc, buf, bytes) ^ 0xffffffff; } + +/* Returns 0 on success; 1 otherwise. */ +uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr, + uint64_t new_lo, uint64_t new_hi) +{ + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; + bool success; + + cmpv = int128_make128(env->exclusive_val, env->exclusive_high); + newv = int128_make128(new_lo, new_hi); + + if (parallel_cpus) { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx); + oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra); + success = int128_eq(oldv, cmpv); +#endif + } else { + uint64_t o0, o1; + +#ifdef CONFIG_USER_ONLY + /* ??? Enforce alignment. */ + uint64_t *haddr = g2h(addr); + o0 = ldq_le_p(haddr + 0); + o1 = ldq_le_p(haddr + 1); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + stq_le_p(haddr + 0, int128_getlo(newv)); + stq_le_p(haddr + 1, int128_gethi(newv)); + } +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx); + TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx); + + o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra); + o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra); + helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra); + } +#endif + } + + return !success; +} + +uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr, + uint64_t new_lo, uint64_t new_hi) +{ + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; + bool success; + + cmpv = int128_make128(env->exclusive_val, env->exclusive_high); + newv = int128_make128(new_lo, new_hi); + + if (parallel_cpus) { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx); + oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra); + success = int128_eq(oldv, cmpv); +#endif + } else { + uint64_t o0, o1; + +#ifdef CONFIG_USER_ONLY + /* ??? Enforce alignment. 
*/ + uint64_t *haddr = g2h(addr); + o1 = ldq_be_p(haddr + 0); + o0 = ldq_be_p(haddr + 1); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + stq_be_p(haddr + 0, int128_gethi(newv)); + stq_be_p(haddr + 1, int128_getlo(newv)); + } +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx); + TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx); + + o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra); + o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra); + helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra); + } +#endif + } + + return !success; +} diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h index 1d3d10fffbce..dd32000e6300 100644 --- a/target-arm/helper-a64.h +++ b/target-arm/helper-a64.h @@ -46,3 +46,5 @@ DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr) DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env) DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32) DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32) +DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64) +DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 96c222722e3d..ded924a0a903 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -1839,37 +1839,41 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn) } } -/* - * Load/Store exclusive instructions are implemented by remembering - * the value/address loaded, and seeing if these are the same - * when the store is performed. This is not actually the architecturally - * mandated semantics, but it works for typical guest code sequences - * and avoids having to monitor regular stores. - * - * In system emulation mode only one CPU will be running at once, so - * this sequence is effectively atomic. In user emulation mode we - * throw an exception and handle the atomic operation elsewhere. 
- */ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, TCGv_i64 addr, int size, bool is_pair) { TCGv_i64 tmp = tcg_temp_new_i64(); - TCGMemOp memop = s->be_data + size; + TCGMemOp be = s->be_data; g_assert(size <= 3); - tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop); - if (is_pair) { - TCGv_i64 addr2 = tcg_temp_new_i64(); TCGv_i64 hitmp = tcg_temp_new_i64(); - g_assert(size >= 2); - tcg_gen_addi_i64(addr2, addr, 1 << size); - tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop); - tcg_temp_free_i64(addr2); + if (size == 3) { + TCGv_i64 addr2 = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), + MO_64 | MO_ALIGN_16 | be); + tcg_gen_addi_i64(addr2, addr, 8); + tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), + MO_64 | MO_ALIGN | be); + tcg_temp_free_i64(addr2); + } else { + g_assert(size == 2); + tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), + MO_64 | MO_ALIGN | be); + if (be == MO_LE) { + tcg_gen_extr32_i64(tmp, hitmp, tmp); + } else { + tcg_gen_extr32_i64(hitmp, tmp, tmp); + } + } + tcg_gen_mov_i64(cpu_exclusive_high, hitmp); tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp); tcg_temp_free_i64(hitmp); + } else { + tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be); } tcg_gen_mov_i64(cpu_exclusive_val, tmp); @@ -1879,16 +1883,6 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, tcg_gen_mov_i64(cpu_exclusive_addr, addr); } -#ifdef CONFIG_USER_ONLY -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i64 addr, int size, int is_pair) -{ - tcg_gen_mov_i64(cpu_exclusive_test, addr); - tcg_gen_movi_i32(cpu_exclusive_info, - size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 14)); - gen_exception_internal_insn(s, 4, EXCP_STREX); -} -#else static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, TCGv_i64 inaddr, int size, int is_pair) { @@ -1916,46 +1910,42 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label); tmp = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), s->be_data + size); - tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label); - tcg_temp_free_i64(tmp); - - if (is_pair) { - TCGv_i64 addrhi = tcg_temp_new_i64(); - TCGv_i64 tmphi = tcg_temp_new_i64(); - - tcg_gen_addi_i64(addrhi, addr, 1 << size); - tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s), - s->be_data + size); - tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label); - - tcg_temp_free_i64(tmphi); - tcg_temp_free_i64(addrhi); - } - - /* We seem to still have the exclusive monitor, so do the store */ - tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s), - s->be_data + size); if (is_pair) { - TCGv_i64 addrhi = tcg_temp_new_i64(); - - tcg_gen_addi_i64(addrhi, addr, 1 << size); - tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi, - get_mem_index(s), s->be_data + size); - tcg_temp_free_i64(addrhi); + if (size == 2) { + TCGv_i64 val = tcg_temp_new_i64(); + tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2)); + tcg_gen_concat32_i64(val, cpu_exclusive_val, cpu_exclusive_high); + tcg_gen_atomic_cmpxchg_i64(tmp, addr, val, tmp, + get_mem_index(s), + size | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, val); + tcg_temp_free_i64(val); + } else if (s->be_data == MO_LE) { + gen_helper_paired_cmpxchg64_le(tmp, cpu_env, addr, cpu_reg(s, rt), + cpu_reg(s, rt2)); + } else { + gen_helper_paired_cmpxchg64_be(tmp, cpu_env, addr, cpu_reg(s, 
rt), + cpu_reg(s, rt2)); + } + } else { + TCGv_i64 val = cpu_reg(s, rt); + tcg_gen_atomic_cmpxchg_i64(tmp, addr, cpu_exclusive_val, val, + get_mem_index(s), + size | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); } tcg_temp_free_i64(addr); - tcg_gen_movi_i64(cpu_reg(s, rd), 0); + tcg_gen_mov_i64(cpu_reg(s, rd), tmp); + tcg_temp_free_i64(tmp); tcg_gen_br(done_label); + gen_set_label(fail_label); tcg_gen_movi_i64(cpu_reg(s, rd), 1); gen_set_label(done_label); tcg_gen_movi_i64(cpu_exclusive_addr, -1); - } -#endif /* Update the Sixty-Four bit (SF) registersize. This logic is derived * from the ARMv8 specs for LDR (Shared decode for all encodings). From b50b82fc486a227c27481d60c75c5ee7cc282028 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:14 -0400 Subject: [PATCH 33/37] linux-user: remove handling of ARM's EXCP_STREX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exception is not emitted anymore. Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-29-git-send-email-cota@braap.org> --- linux-user/main.c | 93 ----------------------------------------------- 1 file changed, 93 deletions(-) diff --git a/linux-user/main.c b/linux-user/main.c index f0c47d49ab85..f9bac444f347 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -553,94 +553,6 @@ do_kernel_trap(CPUARMState *env) return 0; } -/* Store exclusive handling for AArch32 */ -static int do_strex(CPUARMState *env) -{ - uint64_t val; - int size; - int rc = 1; - int segv = 0; - uint32_t addr; - start_exclusive(); - if (env->exclusive_addr != env->exclusive_test) { - goto fail; - } - /* We know we're always AArch32 so the address is in uint32_t range - * unless it was the -1 exclusive-monitor-lost value (which won't - * match exclusive_test above). 
- */ - assert(extract64(env->exclusive_addr, 32, 32) == 0); - addr = env->exclusive_addr; - size = env->exclusive_info & 0xf; - switch (size) { - case 0: - segv = get_user_u8(val, addr); - break; - case 1: - segv = get_user_data_u16(val, addr, env); - break; - case 2: - case 3: - segv = get_user_data_u32(val, addr, env); - break; - default: - abort(); - } - if (segv) { - env->exception.vaddress = addr; - goto done; - } - if (size == 3) { - uint32_t valhi; - segv = get_user_data_u32(valhi, addr + 4, env); - if (segv) { - env->exception.vaddress = addr + 4; - goto done; - } - if (arm_cpu_bswap_data(env)) { - val = deposit64((uint64_t)valhi, 32, 32, val); - } else { - val = deposit64(val, 32, 32, valhi); - } - } - if (val != env->exclusive_val) { - goto fail; - } - - val = env->regs[(env->exclusive_info >> 8) & 0xf]; - switch (size) { - case 0: - segv = put_user_u8(val, addr); - break; - case 1: - segv = put_user_data_u16(val, addr, env); - break; - case 2: - case 3: - segv = put_user_data_u32(val, addr, env); - break; - } - if (segv) { - env->exception.vaddress = addr; - goto done; - } - if (size == 3) { - val = env->regs[(env->exclusive_info >> 12) & 0xf]; - segv = put_user_data_u32(val, addr + 4, env); - if (segv) { - env->exception.vaddress = addr + 4; - goto done; - } - } - rc = 0; -fail: - env->regs[15] += 4; - env->regs[(env->exclusive_info >> 4) & 0xf] = rc; -done: - end_exclusive(); - return segv; -} - void cpu_loop(CPUARMState *env) { CPUState *cs = CPU(arm_env_get_cpu(env)); @@ -815,11 +727,6 @@ void cpu_loop(CPUARMState *env) case EXCP_INTERRUPT: /* just indicate that signals should be handled asap */ break; - case EXCP_STREX: - if (!do_strex(env)) { - break; - } - /* fall through for segv */ case EXCP_PREFETCH_ABORT: case EXCP_DATA_ABORT: addr = env->exception.vaddress; From f4e6eb7ffeefb3f2e9fff0bbe5eb7c9962c31dcd Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:15 -0400 Subject: [PATCH 34/37] linux-user: remove handling of aarch64's EXCP_STREX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exception is not emitted anymore. Reviewed-by: Alex Bennée Signed-off-by: Emilio G. 
Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-30-git-send-email-cota@braap.org> --- linux-user/main.c | 125 ---------------------------------------------- 1 file changed, 125 deletions(-) diff --git a/linux-user/main.c b/linux-user/main.c index f9bac444f347..5ff06ac86c2f 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -775,124 +775,6 @@ void cpu_loop(CPUARMState *env) #else -/* - * Handle AArch64 store-release exclusive - * - * rs = gets the status result of store exclusive - * rt = is the register that is stored - * rt2 = is the second register store (in STP) - * - */ -static int do_strex_a64(CPUARMState *env) -{ - uint64_t val; - int size; - bool is_pair; - int rc = 1; - int segv = 0; - uint64_t addr; - int rs, rt, rt2; - - start_exclusive(); - /* size | is_pair << 2 | (rs << 4) | (rt << 9) | (rt2 << 14)); */ - size = extract32(env->exclusive_info, 0, 2); - is_pair = extract32(env->exclusive_info, 2, 1); - rs = extract32(env->exclusive_info, 4, 5); - rt = extract32(env->exclusive_info, 9, 5); - rt2 = extract32(env->exclusive_info, 14, 5); - - addr = env->exclusive_addr; - - if (addr != env->exclusive_test) { - goto finish; - } - - switch (size) { - case 0: - segv = get_user_u8(val, addr); - break; - case 1: - segv = get_user_u16(val, addr); - break; - case 2: - segv = get_user_u32(val, addr); - break; - case 3: - segv = get_user_u64(val, addr); - break; - default: - abort(); - } - if (segv) { - env->exception.vaddress = addr; - goto error; - } - if (val != env->exclusive_val) { - goto finish; - } - if (is_pair) { - if (size == 2) { - segv = get_user_u32(val, addr + 4); - } else { - segv = get_user_u64(val, addr + 8); - } - if (segv) { - env->exception.vaddress = addr + (size == 2 ? 4 : 8); - goto error; - } - if (val != env->exclusive_high) { - goto finish; - } - } - /* handle the zero register */ - val = rt == 31 ? 0 : env->xregs[rt]; - switch (size) { - case 0: - segv = put_user_u8(val, addr); - break; - case 1: - segv = put_user_u16(val, addr); - break; - case 2: - segv = put_user_u32(val, addr); - break; - case 3: - segv = put_user_u64(val, addr); - break; - } - if (segv) { - goto error; - } - if (is_pair) { - /* handle the zero register */ - val = rt2 == 31 ? 0 : env->xregs[rt2]; - if (size == 2) { - segv = put_user_u32(val, addr + 4); - } else { - segv = put_user_u64(val, addr + 8); - } - if (segv) { - env->exception.vaddress = addr + (size == 2 ? 4 : 8); - goto error; - } - } - rc = 0; -finish: - env->pc += 4; - /* rs == 31 encodes a write to the ZR, thus throwing away - * the status return. This is rather silly but valid. - */ - if (rs < 31) { - env->xregs[rs] = rc; - } -error: - /* instruction faulted, PC does not advance */ - /* either way a strex releases any exclusive lock we have */ - env->exclusive_addr = -1; - end_exclusive(); - return segv; -} - /* AArch64 main loop */ void cpu_loop(CPUARMState *env) { @@ -934,11 +816,6 @@ void cpu_loop(CPUARMState *env) info._sifields._sigfault._addr = env->pc; queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info); break; - case EXCP_STREX: - if (!do_strex_a64(env)) { - break; - } - /* fall through for segv */ case EXCP_PREFETCH_ABORT: case EXCP_DATA_ABORT: info.si_signo = TARGET_SIGSEGV; @@ -974,8 +851,6 @@ void cpu_loop(CPUARMState *env) process_pending_signals(env); /* Exception return on AArch64 always clears the exclusive monitor, * so any return to running guest code implies this. - * A strex (successful or otherwise) also clears the monitor, so - * we don't need to specialcase EXCP_STREX. 
*/ env->exclusive_addr = -1; } From 05188cc72f0399e99c92f608a8e7ca4c8e552c4b Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:16 -0400 Subject: [PATCH 35/37] target-arm: remove EXCP_STREX + cpu_exclusive_{test, info} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exception is not emitted anymore; remove it and the associated TCG variables. Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson Message-Id: <1467054136-10430-31-git-send-email-cota@braap.org> --- target-arm/cpu.h | 5 ----- target-arm/internals.h | 4 +--- target-arm/translate.c | 10 ---------- target-arm/translate.h | 4 ---- 4 files changed, 1 insertion(+), 22 deletions(-) diff --git a/target-arm/cpu.h b/target-arm/cpu.h index 9d75227e044a..19d967b69e8e 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -46,7 +46,6 @@ #define EXCP_BKPT 7 #define EXCP_EXCEPTION_EXIT 8 /* Return from v7M exception. */ #define EXCP_KERNEL_TRAP 9 /* Jumped to kernel code page. */ -#define EXCP_STREX 10 #define EXCP_HVC 11 /* HyperVisor Call */ #define EXCP_HYP_TRAP 12 #define EXCP_SMC 13 /* Secure Monitor Call */ @@ -475,10 +474,6 @@ typedef struct CPUARMState { uint64_t exclusive_addr; uint64_t exclusive_val; uint64_t exclusive_high; -#if defined(CONFIG_USER_ONLY) - uint64_t exclusive_test; - uint32_t exclusive_info; -#endif /* iwMMXt coprocessor state. */ struct { diff --git a/target-arm/internals.h b/target-arm/internals.h index cd574017c6b4..3edccd252920 100644 --- a/target-arm/internals.h +++ b/target-arm/internals.h @@ -46,8 +46,7 @@ static inline bool excp_is_internal(int excp) || excp == EXCP_HALTED || excp == EXCP_EXCEPTION_EXIT || excp == EXCP_KERNEL_TRAP - || excp == EXCP_SEMIHOST - || excp == EXCP_STREX; + || excp == EXCP_SEMIHOST; } /* Exception names for debug logging; note that not all of these @@ -63,7 +62,6 @@ static const char * const excnames[] = { [EXCP_BKPT] = "Breakpoint", [EXCP_EXCEPTION_EXIT] = "QEMU v7M exception exit", [EXCP_KERNEL_TRAP] = "QEMU intercept of kernel commpage", - [EXCP_STREX] = "QEMU intercept of STREX", [EXCP_HVC] = "Hypervisor Call", [EXCP_HYP_TRAP] = "Hypervisor Trap", [EXCP_SMC] = "Secure Monitor Call", diff --git a/target-arm/translate.c b/target-arm/translate.c index 7c32c9462aef..718f7d06f384 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -65,10 +65,6 @@ static TCGv_i32 cpu_R[16]; TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF; TCGv_i64 cpu_exclusive_addr; TCGv_i64 cpu_exclusive_val; -#ifdef CONFIG_USER_ONLY -TCGv_i64 cpu_exclusive_test; -TCGv_i32 cpu_exclusive_info; -#endif /* FIXME: These should be removed. 
*/ static TCGv_i32 cpu_F0s, cpu_F1s; @@ -102,12 +98,6 @@ void arm_translate_init(void) offsetof(CPUARMState, exclusive_addr), "exclusive_addr"); cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env, offsetof(CPUARMState, exclusive_val), "exclusive_val"); -#ifdef CONFIG_USER_ONLY - cpu_exclusive_test = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, exclusive_test), "exclusive_test"); - cpu_exclusive_info = tcg_global_mem_new_i32(cpu_env, - offsetof(CPUARMState, exclusive_info), "exclusive_info"); -#endif a64_translate_init(); } diff --git a/target-arm/translate.h b/target-arm/translate.h index a53f25a40737..285e96f08732 100644 --- a/target-arm/translate.h +++ b/target-arm/translate.h @@ -79,10 +79,6 @@ extern TCGv_env cpu_env; extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF; extern TCGv_i64 cpu_exclusive_addr; extern TCGv_i64 cpu_exclusive_val; -#ifdef CONFIG_USER_ONLY -extern TCGv_i64 cpu_exclusive_test; -extern TCGv_i32 cpu_exclusive_info; -#endif static inline int arm_dc_feature(DisasContext *dc, int feature) { From 6a73ecf5cfcd39b7afb5d6a24174730eac49d4b5 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 3 Sep 2016 11:32:35 -0700 Subject: [PATCH 36/37] target-alpha: Introduce MMU_PHYS_IDX Rather than using helpers for physical accesses, use a mmu index. The primary cleanup is with store-conditional on physical addresses. Signed-off-by: Richard Henderson --- target-alpha/cpu.h | 18 ++++------ target-alpha/helper.c | 8 +++++ target-alpha/helper.h | 9 ----- target-alpha/mem_helper.c | 73 --------------------------------------- target-alpha/translate.c | 50 ++++++++++++++++----------- 5 files changed, 44 insertions(+), 114 deletions(-) diff --git a/target-alpha/cpu.h b/target-alpha/cpu.h index dcdd0416bdd7..871d9baa35c0 100644 --- a/target-alpha/cpu.h +++ b/target-alpha/cpu.h @@ -201,7 +201,7 @@ enum { /* MMU modes definitions */ -/* Alpha has 5 MMU modes: PALcode, kernel, executive, supervisor, and user. +/* Alpha has 5 MMU modes: PALcode, Kernel, Executive, Supervisor, and User. The Unix PALcode only exposes the kernel and user modes; presumably executive and supervisor are used by VMS. @@ -209,22 +209,18 @@ enum { there are PALmode instructions that can access data via physical mode or via an os-installed "alternate mode", which is one of the 4 above. - QEMU does not currently properly distinguish between code/data when - looking up addresses. To avoid having to address this issue, our - emulated PALcode will cheat and use the KSEG mapping for its code+data - rather than physical addresses. + That said, we're only emulating Unix PALcode, and not attempting VMS, + so we don't need to implement Executive and Supervisor. QEMU's own + PALcode cheats and usees the KSEG mapping for its code+data rather than + physical addresses. */ - Moreover, we're only emulating Unix PALcode, and not attempting VMS. - - All of which allows us to drop all but kernel and user modes. - Elide the unused MMU modes to save space. */ - -#define NB_MMU_MODES 2 +#define NB_MMU_MODES 3 #define MMU_MODE0_SUFFIX _kernel #define MMU_MODE1_SUFFIX _user #define MMU_KERNEL_IDX 0 #define MMU_USER_IDX 1 +#define MMU_PHYS_IDX 2 typedef struct CPUAlphaState CPUAlphaState; diff --git a/target-alpha/helper.c b/target-alpha/helper.c index 85168b7ed164..9ba3e1a1b857 100644 --- a/target-alpha/helper.c +++ b/target-alpha/helper.c @@ -126,6 +126,14 @@ static int get_physical_address(CPUAlphaState *env, target_ulong addr, int prot = 0; int ret = MM_K_ACV; + /* Handle physical accesses. 
+    if (mmu_idx == MMU_PHYS_IDX) {
+        phys = addr;
+        prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+        ret = -1;
+        goto exit;
+    }
+
     /* Ensure that the virtual address is properly sign-extended from
        the last implemented virtual address bit.  */
     if (saddr >> TARGET_VIRT_ADDR_SPACE_BITS != saddr >> 63) {
diff --git a/target-alpha/helper.h b/target-alpha/helper.h
index c3d8a3ee491e..004221df8c03 100644
--- a/target-alpha/helper.h
+++ b/target-alpha/helper.h
@@ -92,15 +92,6 @@ DEF_HELPER_FLAGS_2(ieee_input_cmp, TCG_CALL_NO_WG, void, env, i64)
 DEF_HELPER_FLAGS_2(ieee_input_s, TCG_CALL_NO_WG, void, env, i64)
 
 #if !defined (CONFIG_USER_ONLY)
-DEF_HELPER_2(ldl_phys, i64, env, i64)
-DEF_HELPER_2(ldq_phys, i64, env, i64)
-DEF_HELPER_2(ldl_l_phys, i64, env, i64)
-DEF_HELPER_2(ldq_l_phys, i64, env, i64)
-DEF_HELPER_3(stl_phys, void, env, i64, i64)
-DEF_HELPER_3(stq_phys, void, env, i64, i64)
-DEF_HELPER_3(stl_c_phys, i64, env, i64, i64)
-DEF_HELPER_3(stq_c_phys, i64, env, i64, i64)
-
 DEF_HELPER_FLAGS_1(tbia, TCG_CALL_NO_RWG, void, env)
 DEF_HELPER_FLAGS_2(tbis, TCG_CALL_NO_RWG, void, env, i64)
 DEF_HELPER_FLAGS_1(tb_flush, TCG_CALL_NO_RWG, void, env)
diff --git a/target-alpha/mem_helper.c b/target-alpha/mem_helper.c
index 1b2be50be785..78a7d4559046 100644
--- a/target-alpha/mem_helper.c
+++ b/target-alpha/mem_helper.c
@@ -25,79 +25,6 @@
 
 /* Softmmu support */
 #ifndef CONFIG_USER_ONLY
-
-uint64_t helper_ldl_phys(CPUAlphaState *env, uint64_t p)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    return (int32_t)ldl_phys(cs->as, p);
-}
-
-uint64_t helper_ldq_phys(CPUAlphaState *env, uint64_t p)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    return ldq_phys(cs->as, p);
-}
-
-uint64_t helper_ldl_l_phys(CPUAlphaState *env, uint64_t p)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    env->lock_addr = p;
-    return env->lock_value = (int32_t)ldl_phys(cs->as, p);
-}
-
-uint64_t helper_ldq_l_phys(CPUAlphaState *env, uint64_t p)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    env->lock_addr = p;
-    return env->lock_value = ldq_phys(cs->as, p);
-}
-
-void helper_stl_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    stl_phys(cs->as, p, v);
-}
-
-void helper_stq_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    stq_phys(cs->as, p, v);
-}
-
-uint64_t helper_stl_c_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    uint64_t ret = 0;
-
-    if (p == env->lock_addr) {
-        int32_t old = ldl_phys(cs->as, p);
-        if (old == (int32_t)env->lock_value) {
-            stl_phys(cs->as, p, v);
-            ret = 1;
-        }
-    }
-    env->lock_addr = -1;
-
-    return ret;
-}
-
-uint64_t helper_stq_c_phys(CPUAlphaState *env, uint64_t p, uint64_t v)
-{
-    CPUState *cs = CPU(alpha_env_get_cpu(env));
-    uint64_t ret = 0;
-
-    if (p == env->lock_addr) {
-        uint64_t old = ldq_phys(cs->as, p);
-        if (old == env->lock_value) {
-            stq_phys(cs->as, p, v);
-            ret = 1;
-        }
-    }
-    env->lock_addr = -1;
-
-    return ret;
-}
-
 void alpha_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                    MMUAccessType access_type,
                                    int mmu_idx, uintptr_t retaddr)
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index c27c7b9cc499..a2e2a62098cc 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -392,7 +392,8 @@ static inline void gen_store_mem(DisasContext *ctx,
 }
 
 static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
-                                        int32_t disp16, int quad)
+                                        int32_t disp16, int mem_idx,
+                                        TCGMemOp op)
 {
     TCGv addr;
 
@@ -414,7 +415,7 @@ static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
     /* ??? This is handled via a complicated version of compare-and-swap
        in the cpu_loop.  Hopefully one day we'll have a real CAS opcode
        in TCG so that this isn't necessary.  */
-    return gen_excp(ctx, quad ? EXCP_STQ_C : EXCP_STL_C, ra);
+    return gen_excp(ctx, (op & MO_SIZE) == MO_64 ? EXCP_STQ_C : EXCP_STL_C, ra);
 #else
     /* ??? In system mode we are never multi-threaded, so CAS can be
        implemented via a non-atomic load-compare-store sequence.  */
@@ -427,11 +428,10 @@ static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
         tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_lock_addr, lab_fail);
 
         val = tcg_temp_new();
-        tcg_gen_qemu_ld_i64(val, addr, ctx->mem_idx, quad ? MO_LEQ : MO_LESL);
+        tcg_gen_qemu_ld_i64(val, addr, mem_idx, op);
         tcg_gen_brcond_i64(TCG_COND_NE, val, cpu_lock_value, lab_fail);
 
-        tcg_gen_qemu_st_i64(ctx->ir[ra], addr, ctx->mem_idx,
-                            quad ? MO_LEQ : MO_LEUL);
+        tcg_gen_qemu_st_i64(ctx->ir[ra], addr, mem_idx, op);
         tcg_gen_movi_i64(ctx->ir[ra], 1);
         tcg_gen_br(lab_done);
 
@@ -2423,19 +2423,19 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             switch ((insn >> 12) & 0xF) {
             case 0x0:
                 /* Longword physical access (hw_ldl/p) */
-                gen_helper_ldl_phys(va, cpu_env, addr);
+                tcg_gen_qemu_ld_i64(va, addr, MMU_PHYS_IDX, MO_LESL);
                 break;
             case 0x1:
                 /* Quadword physical access (hw_ldq/p) */
-                gen_helper_ldq_phys(va, cpu_env, addr);
+                tcg_gen_qemu_ld_i64(va, addr, MMU_PHYS_IDX, MO_LEQ);
                 break;
             case 0x2:
                 /* Longword physical access with lock (hw_ldl_l/p) */
-                gen_helper_ldl_l_phys(va, cpu_env, addr);
+                gen_qemu_ldl_l(va, addr, MMU_PHYS_IDX);
                 break;
             case 0x3:
                 /* Quadword physical access with lock (hw_ldq_l/p) */
-                gen_helper_ldq_l_phys(va, cpu_env, addr);
+                gen_qemu_ldq_l(va, addr, MMU_PHYS_IDX);
                 break;
             case 0x4:
                 /* Longword virtual PTE fetch (hw_ldl/v) */
@@ -2674,27 +2674,34 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
 #ifndef CONFIG_USER_ONLY
         REQUIRE_TB_FLAG(TB_FLAGS_PAL_MODE);
         {
-            TCGv addr = tcg_temp_new();
-            va = load_gpr(ctx, ra);
-            vb = load_gpr(ctx, rb);
-
-            tcg_gen_addi_i64(addr, vb, disp12);
             switch ((insn >> 12) & 0xF) {
             case 0x0:
                 /* Longword physical access */
-                gen_helper_stl_phys(cpu_env, addr, va);
+                va = load_gpr(ctx, ra);
+                vb = load_gpr(ctx, rb);
+                tmp = tcg_temp_new();
+                tcg_gen_addi_i64(tmp, vb, disp12);
+                tcg_gen_qemu_st_i64(va, tmp, MMU_PHYS_IDX, MO_LESL);
+                tcg_temp_free(tmp);
                 break;
             case 0x1:
                 /* Quadword physical access */
-                gen_helper_stq_phys(cpu_env, addr, va);
+                va = load_gpr(ctx, ra);
+                vb = load_gpr(ctx, rb);
+                tmp = tcg_temp_new();
+                tcg_gen_addi_i64(tmp, vb, disp12);
+                tcg_gen_qemu_st_i64(va, tmp, MMU_PHYS_IDX, MO_LEQ);
+                tcg_temp_free(tmp);
                 break;
            case 0x2:
                 /* Longword physical access with lock */
-                gen_helper_stl_c_phys(dest_gpr(ctx, ra), cpu_env, addr, va);
+                ret = gen_store_conditional(ctx, ra, rb, disp12,
+                                            MMU_PHYS_IDX, MO_LESL);
                 break;
             case 0x3:
                 /* Quadword physical access with lock */
-                gen_helper_stq_c_phys(dest_gpr(ctx, ra), cpu_env, addr, va);
+                ret = gen_store_conditional(ctx, ra, rb, disp12,
+                                            MMU_PHYS_IDX, MO_LEQ);
                 break;
             case 0x4:
                 /* Longword virtual access */
@@ -2733,7 +2740,6 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
                 /* Invalid */
                 goto invalid_opc;
             }
-            tcg_temp_free(addr);
             break;
         }
 #else
@@ -2797,11 +2803,13 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
         break;
     case 0x2E:
         /* STL_C */
-        ret = gen_store_conditional(ctx, ra, rb, disp16, 0);
+        ret = gen_store_conditional(ctx, ra, rb, disp16,
+                                    ctx->mem_idx, MO_LESL);
         break;
     case 0x2F:
         /* STQ_C */
-        ret = gen_store_conditional(ctx, ra, rb, disp16, 1);
+        ret = gen_store_conditional(ctx, ra, rb, disp16,
+                                    ctx->mem_idx, MO_LEQ);
         break;
     case 0x30:
         /* BR */

From ed2839166c21e001d15868f4d9591a21aaebd547 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Fri, 2 Sep 2016 12:52:28 -0700
Subject: [PATCH 37/37] target-alpha: Emulate LL/SC using cmpxchg helpers

Emulating LL/SC with cmpxchg is not correct, since it can suffer from
the ABA problem.  However, portable parallel code is written assuming
only cmpxchg, which means that in practice this is a viable alternative.

Signed-off-by: Richard Henderson
---
 linux-user/main.c        |  49 ------------------
 target-alpha/cpu.h       |   4 --
 target-alpha/helper.c    |   6 ---
 target-alpha/machine.c   |   2 -
 target-alpha/translate.c | 104 +++++++++++++++++----------------------
 5 files changed, 45 insertions(+), 120 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index 5ff06ac86c2f..75b199f2743d 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -2926,51 +2926,6 @@ void cpu_loop(CPUM68KState *env)
 #endif /* TARGET_M68K */
 
 #ifdef TARGET_ALPHA
-static void do_store_exclusive(CPUAlphaState *env, int reg, int quad)
-{
-    target_ulong addr, val, tmp;
-    target_siginfo_t info;
-    int ret = 0;
-
-    addr = env->lock_addr;
-    tmp = env->lock_st_addr;
-    env->lock_addr = -1;
-    env->lock_st_addr = 0;
-
-    start_exclusive();
-    mmap_lock();
-
-    if (addr == tmp) {
-        if (quad ? get_user_s64(val, addr) : get_user_s32(val, addr)) {
-            goto do_sigsegv;
-        }
-
-        if (val == env->lock_value) {
-            tmp = env->ir[reg];
-            if (quad ? put_user_u64(tmp, addr) : put_user_u32(tmp, addr)) {
-                goto do_sigsegv;
-            }
-            ret = 1;
-        }
-    }
-    env->ir[reg] = ret;
-    env->pc += 4;
-
-    mmap_unlock();
-    end_exclusive();
-    return;
-
- do_sigsegv:
-    mmap_unlock();
-    end_exclusive();
-
-    info.si_signo = TARGET_SIGSEGV;
-    info.si_errno = 0;
-    info.si_code = TARGET_SEGV_MAPERR;
-    info._sifields._sigfault._addr = addr;
-    queue_signal(env, TARGET_SIGSEGV, QEMU_SI_FAULT, &info);
-}
-
 void cpu_loop(CPUAlphaState *env)
 {
     CPUState *cs = CPU(alpha_env_get_cpu(env));
@@ -3145,10 +3100,6 @@ void cpu_loop(CPUAlphaState *env)
                 queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
             }
             break;
-        case EXCP_STL_C:
-        case EXCP_STQ_C:
-            do_store_exclusive(env, env->error_code, trapnr - EXCP_STL_C);
-            break;
         case EXCP_INTERRUPT:
             /* Just indicate that signals should be handled asap.  */
             break;
diff --git a/target-alpha/cpu.h b/target-alpha/cpu.h
index 871d9baa35c0..b08d1601d124 100644
--- a/target-alpha/cpu.h
+++ b/target-alpha/cpu.h
@@ -230,7 +230,6 @@ struct CPUAlphaState {
     uint64_t pc;
     uint64_t unique;
     uint64_t lock_addr;
-    uint64_t lock_st_addr;
     uint64_t lock_value;
 
     /* The FPCR, and disassembled portions thereof.  */
@@ -346,9 +345,6 @@ enum {
     EXCP_ARITH,
     EXCP_FEN,
     EXCP_CALL_PAL,
-    /* For Usermode emulation.  */
-    EXCP_STL_C,
-    EXCP_STQ_C,
 };
 
 /* Alpha-specific interrupt pending bits.  */
diff --git a/target-alpha/helper.c b/target-alpha/helper.c
index 9ba3e1a1b857..2ef6cbe2a4f3 100644
--- a/target-alpha/helper.c
+++ b/target-alpha/helper.c
@@ -306,12 +306,6 @@ void alpha_cpu_do_interrupt(CPUState *cs)
         case EXCP_CALL_PAL:
             name = "call_pal";
             break;
-        case EXCP_STL_C:
-            name = "stl_c";
-            break;
-        case EXCP_STQ_C:
-            name = "stq_c";
-            break;
         }
         qemu_log("INT %6d: %s(%#x) pc=%016" PRIx64 " sp=%016" PRIx64 "\n",
                  ++count, name, env->error_code, env->pc, env->ir[IR_SP]);
diff --git a/target-alpha/machine.c b/target-alpha/machine.c
index 710b7835b425..b99a123a39f1 100644
--- a/target-alpha/machine.c
+++ b/target-alpha/machine.c
@@ -45,8 +45,6 @@ static VMStateField vmstate_env_fields[] = {
     VMSTATE_UINTTL(unique, CPUAlphaState),
     VMSTATE_UINTTL(lock_addr, CPUAlphaState),
     VMSTATE_UINTTL(lock_value, CPUAlphaState),
-    /* Note that lock_st_addr is not saved; it is a temporary
-       used during the execution of the st[lq]_c insns.  */
 
     VMSTATE_UINT8(ps, CPUAlphaState),
     VMSTATE_UINT8(intr_flag, CPUAlphaState),
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index a2e2a62098cc..03e47765edc6 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -99,7 +99,6 @@ static TCGv cpu_std_ir[31];
 static TCGv cpu_fir[31];
 static TCGv cpu_pc;
 static TCGv cpu_lock_addr;
-static TCGv cpu_lock_st_addr;
 static TCGv cpu_lock_value;
 
 #ifndef CONFIG_USER_ONLY
@@ -116,7 +115,6 @@ void alpha_translate_init(void)
     static const GlobalVar vars[] = {
         DEF_VAR(pc),
         DEF_VAR(lock_addr),
-        DEF_VAR(lock_st_addr),
         DEF_VAR(lock_value),
     };
 
@@ -198,6 +196,23 @@ static TCGv dest_sink(DisasContext *ctx)
     return ctx->sink;
 }
 
+static void free_context_temps(DisasContext *ctx)
+{
+    if (!TCGV_IS_UNUSED_I64(ctx->sink)) {
+        tcg_gen_discard_i64(ctx->sink);
+        tcg_temp_free(ctx->sink);
+        TCGV_UNUSED_I64(ctx->sink);
+    }
+    if (!TCGV_IS_UNUSED_I64(ctx->zero)) {
+        tcg_temp_free(ctx->zero);
+        TCGV_UNUSED_I64(ctx->zero);
+    }
+    if (!TCGV_IS_UNUSED_I64(ctx->lit)) {
+        tcg_temp_free(ctx->lit);
+        TCGV_UNUSED_I64(ctx->lit);
+    }
+}
+
 static TCGv load_gpr(DisasContext *ctx, unsigned reg)
 {
     if (likely(reg < 31)) {
@@ -395,56 +410,37 @@ static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
                                         int32_t disp16, int mem_idx,
                                         TCGMemOp op)
 {
-    TCGv addr;
-
-    if (ra == 31) {
-        /* ??? Don't bother storing anything.  The user can't tell
-           the difference, since the zero register always reads zero.  */
-        return NO_EXIT;
-    }
-
-#if defined(CONFIG_USER_ONLY)
-    addr = cpu_lock_st_addr;
-#else
-    addr = tcg_temp_local_new();
-#endif
+    TCGLabel *lab_fail, *lab_done;
+    TCGv addr, val;
 
+    addr = tcg_temp_new_i64();
     tcg_gen_addi_i64(addr, load_gpr(ctx, rb), disp16);
+    free_context_temps(ctx);
 
-#if defined(CONFIG_USER_ONLY)
-    /* ??? This is handled via a complicated version of compare-and-swap
-       in the cpu_loop.  Hopefully one day we'll have a real CAS opcode
-       in TCG so that this isn't necessary.  */
-    return gen_excp(ctx, (op & MO_SIZE) == MO_64 ? EXCP_STQ_C : EXCP_STL_C, ra);
-#else
-    /* ??? In system mode we are never multi-threaded, so CAS can be
-       implemented via a non-atomic load-compare-store sequence.  */
-    {
-        TCGLabel *lab_fail, *lab_done;
-        TCGv val;
+    lab_fail = gen_new_label();
+    lab_done = gen_new_label();
+    tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_lock_addr, lab_fail);
+    tcg_temp_free_i64(addr);
 
-        lab_fail = gen_new_label();
-        lab_done = gen_new_label();
-        tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_lock_addr, lab_fail);
+    val = tcg_temp_new_i64();
+    tcg_gen_atomic_cmpxchg_i64(val, cpu_lock_addr, cpu_lock_value,
+                               load_gpr(ctx, ra), mem_idx, op);
+    free_context_temps(ctx);
 
-        val = tcg_temp_new();
-        tcg_gen_qemu_ld_i64(val, addr, mem_idx, op);
-        tcg_gen_brcond_i64(TCG_COND_NE, val, cpu_lock_value, lab_fail);
-
-        tcg_gen_qemu_st_i64(ctx->ir[ra], addr, mem_idx, op);
-        tcg_gen_movi_i64(ctx->ir[ra], 1);
-        tcg_gen_br(lab_done);
+    if (ra != 31) {
+        tcg_gen_setcond_i64(TCG_COND_EQ, ctx->ir[ra], val, cpu_lock_value);
+    }
+    tcg_temp_free_i64(val);
+    tcg_gen_br(lab_done);
 
-        gen_set_label(lab_fail);
+    gen_set_label(lab_fail);
+    if (ra != 31) {
         tcg_gen_movi_i64(ctx->ir[ra], 0);
-
-        gen_set_label(lab_done);
-        tcg_gen_movi_i64(cpu_lock_addr, -1);
-
-        tcg_temp_free(addr);
-        return NO_EXIT;
     }
-#endif
+
+    gen_set_label(lab_done);
+    tcg_gen_movi_i64(cpu_lock_addr, -1);
+    return NO_EXIT;
 }
 
 static bool in_superpage(DisasContext *ctx, int64_t addr)
@@ -2914,6 +2910,10 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
     /* Similarly for flush-to-zero.  */
     ctx.tb_ftz = -1;
 
+    TCGV_UNUSED_I64(ctx.zero);
+    TCGV_UNUSED_I64(ctx.sink);
+    TCGV_UNUSED_I64(ctx.lit);
+
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
     if (max_insns == 0) {
@@ -2948,23 +2948,9 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
         }
         insn = cpu_ldl_code(env, ctx.pc);
 
-        TCGV_UNUSED_I64(ctx.zero);
-        TCGV_UNUSED_I64(ctx.sink);
-        TCGV_UNUSED_I64(ctx.lit);
-
         ctx.pc += 4;
         ret = translate_one(ctxp, insn);
-
-        if (!TCGV_IS_UNUSED_I64(ctx.sink)) {
-            tcg_gen_discard_i64(ctx.sink);
-            tcg_temp_free(ctx.sink);
-        }
-        if (!TCGV_IS_UNUSED_I64(ctx.zero)) {
-            tcg_temp_free(ctx.zero);
-        }
-        if (!TCGV_IS_UNUSED_I64(ctx.lit)) {
-            tcg_temp_free(ctx.lit);
-        }
+        free_context_temps(ctxp);
 
         /* If we reach a page boundary, are single stepping,
            or exhaust instruction count, stop generation.  */
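As an aside, the ABA caveat described in the PATCH 37/37 commit message above can
be illustrated with a small, self-contained C11 sketch.  This is not QEMU code and
is not part of the series: the names ll_load, sc_store and the saved-lock globals
are hypothetical and exist only to show why emulating load-locked/store-conditional
with a single compare-and-swap (as the series does via tcg_gen_atomic_cmpxchg_i64)
admits the ABA window that real LL/SC hardware rejects.

    /* Illustrative only -- not QEMU code. */
    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t *ll_addr;   /* address captured by the "load-locked" */
    static uint64_t ll_value;           /* value captured by the "load-locked" */

    static uint64_t ll_load(_Atomic uint64_t *addr)
    {
        ll_addr = addr;
        ll_value = atomic_load(addr);
        return ll_value;
    }

    /* Returns 1 on success and 0 on failure, mirroring stl_c/stq_c semantics. */
    static int sc_store(_Atomic uint64_t *addr, uint64_t new_val)
    {
        uint64_t expected = ll_value;

        if (addr != ll_addr) {
            return 0;
        }
        /* ABA window: if another thread changed *addr away from `expected`
           and then back again, this cmpxchg still succeeds, whereas real
           LL/SC hardware would fail the store-conditional.  Portable code
           written against cmpxchg semantics does not observe the difference,
           which is the commit message's argument for the trade-off. */
        return atomic_compare_exchange_strong(addr, &expected, new_val);
    }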