Merge tag 'pull-tcg-20230915-2' of https://gitlab.com/rth7680/qemu in…

…to staging *: Delete checks for old host definitions tcg/loongarch64: Generate LSX instructions fpu: Add conversions between bfloat16 and [u]int8 fpu: Handle m68k extended precision denormals properly accel/tcg: Improve cputlb i/o organization accel/tcg: Simplify tlb_plugin_lookup accel/tcg: Remove false-negative halted assertion tcg: Add gvec compare with immediate and scalar operand tcg/aarch64: Emit BTI insns at jump landing pads [Resolved conflict between CPUINFO_PMULL and CPUINFO_BTI. --Stefan] * tag 'pull-tcg-20230915-2' of https://gitlab.com/rth7680/qemu: (39 commits) tcg: Map code_gen_buffer with PROT_BTI tcg/aarch64: Emit BTI insns at jump landing pads util/cpuinfo-aarch64: Add CPUINFO_BTI tcg: Add tcg_out_tb_start backend hook fpu: Handle m68k extended precision denormals properly fpu: Add conversions between bfloat16 and [u]int8 accel/tcg: Introduce do_st16_mmio_leN accel/tcg: Introduce do_ld16_mmio_beN accel/tcg: Merge io_writex into do_st_mmio_leN accel/tcg: Merge io_readx into do_ld_mmio_beN accel/tcg: Replace direct use of io_readx/io_writex in do_{ld,st}_1 accel/tcg: Merge cpu_transaction_failed into io_failed plugin: Simplify struct qemu_plugin_hwaddr accel/tcg: Use CPUTLBEntryFull.phys_addr in io_failed accel/tcg: Split out io_prepare and io_failed accel/tcg: Simplify tlb_plugin_lookup target/arm: Use tcg_gen_gvec_cmpi for compare vs 0 tcg: Add gvec compare with immediate and scalar operand tcg/loongarch64: Implement 128-bit load & store tcg/loongarch64: Lower rotli_vec to vrotri ... Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
qemu · Sep 19, 2023 · d775494 · d775494
2 parents 13d6b16 + a97a837
commit d775494
Show file tree

Hide file tree

Showing 39 changed files with 7,535 additions and 509 deletions.
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -100,14 +100,9 @@ static void *mttcg_cpu_thread_fn(void *arg)
                 break;
             case EXCP_HALTED:
                 /*
-                 * during start-up the vCPU is reset and the thread is
-                 * kicked several times. If we don't ensure we go back
-                 * to sleep in the halted state we won't cleanly
-                 * start-up when the vCPU is enabled.
-                 *
-                 * cpu->halted should ensure we sleep in wait_io_event
+                 * Usually cpu->halted is set, but may have already been
+                 * reset by another thread by the time we arrive here.
                  */
-                g_assert(cpu->halted);
                 break;
             case EXCP_ATOMIC:
                 qemu_mutex_unlock_iothread();

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
@@ -1042,6 +1042,32 @@ DO_CMP2(64)
 #undef DO_CMP1
 #undef DO_CMP2
 
+#define DO_CMP1(NAME, TYPE, OP)                                            \
+void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t desc)           \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    TYPE inv = simd_data(desc), b = b64;                                   \
+    for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) {                   \
+        *(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ inv);               \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+#define DO_CMP2(SZ) \
+    DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==)    \
+    DO_CMP1(gvec_lts##SZ, int##SZ##_t, <)      \
+    DO_CMP1(gvec_les##SZ, int##SZ##_t, <=)     \
+    DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <)    \
+    DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
+
+DO_CMP2(8)
+DO_CMP2(16)
+DO_CMP2(32)
+DO_CMP2(64)
+
+#undef DO_CMP1
+#undef DO_CMP2
+
 void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
@@ -297,4 +297,29 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
 DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
@@ -118,7 +118,8 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
         } else {
             int shift = frac_normalize(p);
             p->cls = float_class_normal;
-            p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
+            p->exp = fmt->frac_shift - fmt->exp_bias
+                   - shift + !fmt->m68k_denormal;
         }
     } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
         p->cls = float_class_normal;
@@ -256,7 +257,7 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
             is_tiny = !frac_addi(&discard, p, inc);
         }
 
-        frac_shrjam(p, 1 - exp);
+        frac_shrjam(p, !fmt->m68k_denormal - exp);
 
         if (p->frac_lo & round_mask) {
             /* Need to recompute round-to-even/round-to-odd. */
@@ -287,7 +288,7 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
             p->frac_lo &= ~round_mask;
         }
 
-        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) != 0;
+        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
         frac_shr(p, frac_shift);
 
         if (is_tiny && (flags & float_flag_inexact)) {

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
@@ -517,6 +517,7 @@ typedef struct {
  *   round_mask: bits below lsb which must be rounded
  * The following optional modifiers are available:
  *   arm_althp: handle ARM Alternative Half Precision
+ *   m68k_denormal: explicit integer bit for extended precision may be 1
  */
 typedef struct {
     int exp_size;
@@ -526,6 +527,7 @@ typedef struct {
     int frac_size;
     int frac_shift;
     bool arm_althp;
+    bool m68k_denormal;
     uint64_t round_mask;
 } FloatFmt;
 
@@ -576,7 +578,12 @@ static const FloatFmt float128_params = {
 static const FloatFmt floatx80_params[3] = {
     [floatx80_precision_s] = { FLOATX80_PARAMS(23) },
     [floatx80_precision_d] = { FLOATX80_PARAMS(52) },
-    [floatx80_precision_x] = { FLOATX80_PARAMS(64) },
+    [floatx80_precision_x] = {
+        FLOATX80_PARAMS(64),
+#ifdef TARGET_M68K
+        .m68k_denormal = true,
+#endif
+    },
 };
 
 /* Unpack a float to parts, but do not canonicalize.  */
@@ -3126,6 +3133,15 @@ int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
     return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
 }
 
+int8_t bfloat16_to_int8_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
+                               float_status *s)
+{
+    FloatParts64 p;
+
+    bfloat16_unpack_canonical(&p, a, s);
+    return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s);
+}
+
 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
 {
@@ -3392,6 +3408,11 @@ int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *s)
     return floatx80_to_int64_scalbn(a, float_round_to_zero, 0, s);
 }
 
+int8_t bfloat16_to_int8(bfloat16 a, float_status *s)
+{
+    return bfloat16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
+}
+
 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
 {
     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
@@ -3407,6 +3428,11 @@ int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
 }
 
+int8_t bfloat16_to_int8_round_to_zero(bfloat16 a, float_status *s)
+{
+    return bfloat16_to_int8_scalbn(a, float_round_to_zero, 0, s);
+}
+
 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
 {
     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
@@ -3534,6 +3560,15 @@ uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
     return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
 }
 
+uint8_t bfloat16_to_uint8_scalbn(bfloat16 a, FloatRoundMode rmode,
+                                 int scale, float_status *s)
+{
+    FloatParts64 p;
+
+    bfloat16_unpack_canonical(&p, a, s);
+    return parts_float_to_uint(&p, rmode, scale, UINT8_MAX, s);
+}
+
 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                    int scale, float_status *s)
 {
@@ -3759,6 +3794,11 @@ Int128 float128_to_uint128_round_to_zero(float128 a, float_status *s)
     return float128_to_uint128_scalbn(a, float_round_to_zero, 0, s);
 }
 
+uint8_t bfloat16_to_uint8(bfloat16 a, float_status *s)
+{
+    return bfloat16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
+}
+
 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
 {
     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
@@ -3774,6 +3814,11 @@ uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
 }
 
+uint8_t bfloat16_to_uint8_round_to_zero(bfloat16 a, float_status *s)
+{
+    return bfloat16_to_uint8_scalbn(a, float_round_to_zero, 0, s);
+}
+
 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
 {
     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
@@ -3929,6 +3974,11 @@ bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
     return int64_to_bfloat16_scalbn(a, scale, status);
 }
 
+bfloat16 int8_to_bfloat16_scalbn(int8_t a, int scale, float_status *status)
+{
+    return int64_to_bfloat16_scalbn(a, scale, status);
+}
+
 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
 {
     return int64_to_bfloat16_scalbn(a, 0, status);
@@ -3944,6 +3994,11 @@ bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
     return int64_to_bfloat16_scalbn(a, 0, status);
 }
 
+bfloat16 int8_to_bfloat16(int8_t a, float_status *status)
+{
+    return int64_to_bfloat16_scalbn(a, 0, status);
+}
+
 float128 int128_to_float128(Int128 a, float_status *status)
 {
     FloatParts128 p = { };
@@ -4139,6 +4194,11 @@ bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
     return uint64_to_bfloat16_scalbn(a, scale, status);
 }
 
+bfloat16 uint8_to_bfloat16_scalbn(uint8_t a, int scale, float_status *status)
+{
+    return uint64_to_bfloat16_scalbn(a, scale, status);
+}
+
 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
 {
     return uint64_to_bfloat16_scalbn(a, 0, status);
@@ -4154,6 +4214,11 @@ bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
     return uint64_to_bfloat16_scalbn(a, 0, status);
 }
 
+bfloat16 uint8_to_bfloat16(uint8_t a, float_status *status)
+{
+    return uint64_to_bfloat16_scalbn(a, 0, status);
+}
+
 float128 uint64_to_float128(uint64_t a, float_status *status)
 {
     FloatParts128 p;

diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
@@ -11,6 +11,7 @@
 #define CPUINFO_LSE2            (1u << 2)
 #define CPUINFO_AES             (1u << 3)
 #define CPUINFO_PMULL           (1u << 4)
+#define CPUINFO_BTI             (1u << 5)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
@@ -100,12 +100,12 @@
 typedef struct CPUTLBEntryFull {
     /*
      * @xlat_section contains:
-     *  - in the lower TARGET_PAGE_BITS, a physical section number
-     *  - with the lower TARGET_PAGE_BITS masked off, an offset which
-     *    must be added to the virtual address to obtain:
-     *     + the ram_addr_t of the target RAM (if the physical section
-     *       number is PHYS_SECTION_NOTDIRTY or PHYS_SECTION_ROM)
-     *     + the offset within the target MemoryRegion (otherwise)
+     *  - For ram, an offset which must be added to the virtual address
+     *    to obtain the ram_addr_t of the target RAM
+     *  - For other memory regions,
+     *     + in the lower TARGET_PAGE_BITS, the physical section number
+     *     + with the TARGET_PAGE_BITS masked off, the offset within
+     *       the target MemoryRegion
      */
     hwaddr xlat_section;
 

diff --git a/include/exec/user/thunk.h b/include/exec/user/thunk.h
@@ -111,8 +111,7 @@ static inline int thunk_type_size(const argtype *type_ptr, int is_host)
         if (is_host) {
 #if defined(HOST_X86_64)
             return 8;
-#elif defined(HOST_ALPHA) || defined(HOST_IA64) || defined(HOST_MIPS) || \
-      defined(HOST_PARISC) || defined(HOST_SPARC64)
+#elif defined(HOST_MIPS) || defined(HOST_SPARC64)
             return 4;
 #elif defined(HOST_PPC)
             return sizeof(void *);

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
@@ -366,46 +366,58 @@ float32 bfloat16_to_float32(bfloat16, float_status *status);
 bfloat16 float64_to_bfloat16(float64 a, float_status *status);
 float64 bfloat16_to_float64(bfloat16 a, float_status *status);
 
+int8_t bfloat16_to_int8_scalbn(bfloat16, FloatRoundMode,
+                               int, float_status *status);
 int16_t bfloat16_to_int16_scalbn(bfloat16, FloatRoundMode,
                                  int, float_status *status);
 int32_t bfloat16_to_int32_scalbn(bfloat16, FloatRoundMode,
                                  int, float_status *status);
 int64_t bfloat16_to_int64_scalbn(bfloat16, FloatRoundMode,
                                  int, float_status *status);
 
+int8_t bfloat16_to_int8(bfloat16, float_status *status);
 int16_t bfloat16_to_int16(bfloat16, float_status *status);
 int32_t bfloat16_to_int32(bfloat16, float_status *status);
 int64_t bfloat16_to_int64(bfloat16, float_status *status);
 
+int8_t bfloat16_to_int8_round_to_zero(bfloat16, float_status *status);
 int16_t bfloat16_to_int16_round_to_zero(bfloat16, float_status *status);
 int32_t bfloat16_to_int32_round_to_zero(bfloat16, float_status *status);
 int64_t bfloat16_to_int64_round_to_zero(bfloat16, float_status *status);
 
+uint8_t bfloat16_to_uint8_scalbn(bfloat16 a, FloatRoundMode,
+                                 int, float_status *status);
 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode,
                                    int, float_status *status);
 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode,
                                    int, float_status *status);
 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode,
                                    int, float_status *status);
 
+uint8_t bfloat16_to_uint8(bfloat16 a, float_status *status);
 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *status);
 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *status);
 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *status);
 
+uint8_t bfloat16_to_uint8_round_to_zero(bfloat16 a, float_status *status);
 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *status);
 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *status);
 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *status);
 
+bfloat16 int8_to_bfloat16_scalbn(int8_t a, int, float_status *status);
 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int, float_status *status);
 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int, float_status *status);
 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int, float_status *status);
+bfloat16 uint8_to_bfloat16_scalbn(uint8_t a, int, float_status *status);
 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int, float_status *status);
 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int, float_status *status);
 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int, float_status *status);
 
+bfloat16 int8_to_bfloat16(int8_t a, float_status *status);
 bfloat16 int16_to_bfloat16(int16_t a, float_status *status);
 bfloat16 int32_to_bfloat16(int32_t a, float_status *status);
 bfloat16 int64_to_bfloat16(int64_t a, float_status *status);
+bfloat16 uint8_to_bfloat16(uint8_t a, float_status *status);
 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status);
 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status);
 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status);

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
@@ -227,17 +227,6 @@ struct CPUWatchpoint {
     QTAILQ_ENTRY(CPUWatchpoint) entry;
 };
 
-#ifdef CONFIG_PLUGIN
-/*
- * For plugins we sometime need to save the resolved iotlb data before
- * the memory regions get moved around  by io_writex.
- */
-typedef struct SavedIOTLB {
-    MemoryRegionSection *section;
-    hwaddr mr_offset;
-} SavedIOTLB;
-#endif
-
 struct KVMState;
 struct kvm_run;
 
@@ -409,8 +398,6 @@ struct CPUState {
 
 #ifdef CONFIG_PLUGIN
     GArray *plugin_mem_cbs;
-    /* saved iotlb data from io_writex */
-    SavedIOTLB saved_iotlb;
 #endif
 
     /* TODO Move common fields from CPUArchState here. */