Merge tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu into staging

Improvements to 128-bit atomics:
  - Separate __int128_t type and arithmetic detection
  - Support 128-bit load/store in backend for i386, aarch64, ppc64, s390x
  - Accelerate atomics via host/include/
Decodetree:
  - Add named field syntax
  - Move tests to meson

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmR2R10dHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/bsgf/XLi8q+ITyoEAKwG4
# 6ML7DktLAdIs9Euah9twqe16U0BM0YzpKfymBfVVBKKaIa0524N4ZKIT3h6EeJo+
# f+ultqrpsnH+aQh4wc3ZCkEvRdhzhFT8VcoRTunJuJrbL3Y8n2ZSgODUL2a0tahT
# Nn+zEPm8rzQanSKQHq5kyNBLpgTUKjc5wKfvy/WwttnFmkTnqzcuEA6nPVOVwOHC
# lZBQCByIQWsHfFHUVJFvsFzBQbm0mAiW6FNKzPBkoXon0h/UZUI1lV+xXzgutFs+
# zR2O8IZwLYRu2wOWiTF8Nn2qQafkB3Dhwoq3JTEXhOqosOPExbIiWlsZDlPiKRJk
# bwmQlg==
# =XQMb
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 30 May 2023 11:58:37 AM PDT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]

* tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu: (27 commits)
  tests/decode: Add tests for various named-field cases
  scripts/decodetree: Implement named field support
  scripts/decodetree: Implement a topological sort
  scripts/decodetree: Pass lvalue-formatter function to str_extract()
  docs: Document decodetree named field syntax
  tests/decode: Convert tests to meson
  decodetree: Do not remove output_file from /dev
  decodetree: Diagnose empty pattern group
  decodetree: Fix recursion in prop_format and build_tree
  decodetree: Add --test-for-error
  tcg: Remove TCG_TARGET_TLB_DISPLACEMENT_BITS
  accel/tcg: Add aarch64 store_atom_insert_al16
  accel/tcg: Add aarch64 lse2 load_atom_extract_al16_or_al8
  accel/tcg: Add x86_64 load_atom_extract_al16_or_al8
  accel/tcg: Extract store_atom_insert_al16 to host header
  accel/tcg: Extract load_atom_extract_al16_or_al8 to host header
  tcg/s390x: Support 128-bit load/store
  tcg/ppc: Support 128-bit load/store
  tcg/aarch64: Support 128-bit load/store
  tcg/aarch64: Simplify constraints on qemu_ld/st
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
rth7680 committed May 30, 2023
2 parents 7f027ee + 276d77d commit 51bdb0b
Showing 38 changed files with 1,312 additions and 225 deletions.
80 changes: 5 additions & 75 deletions accel/tcg/ldst_atomicity.c.inc
@@ -9,6 +9,9 @@
* See the COPYING file in the top-level directory.
*/

#include "host/load-extract-al16-al8.h"
#include "host/store-insert-al16.h"

#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
@@ -156,7 +159,7 @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
uint64_t *p = __builtin_assume_aligned(pv, 8);
return *p;
}
@@ -191,7 +194,7 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
return *p;
}
#endif
@@ -311,40 +314,6 @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
return int128_getlo(r);
}

/**
* load_atom_extract_al16_or_al8:
* @p: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @p, when p % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
int o = pi & 7;
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
Int128 r;

pv = (void *)(pi & ~7);
if (pi & 8) {
uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
uint64_t a = qatomic_read__nocheck(p8);
uint64_t b = qatomic_read__nocheck(p8 + 1);

if (HOST_BIG_ENDIAN) {
r = int128_make128(b, a);
} else {
r = int128_make128(a, b);
}
} else {
r = atomic16_read_ro(pv);
}
return int128_getlo(int128_urshift(r, shr));
}

/**
* load_atom_4_by_2:
* @pv: host address
@@ -713,45 +682,6 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
__uint128_t *pu, old, new;

/* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
pu = __builtin_assume_aligned(ps, 16);
old = *pu;
do {
new = (old & ~msk.u) | val.u;
} while (!__atomic_compare_exchange_n(pu, &old, new, true,
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
__uint128_t *pu, old, new;

/*
* Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
* defer to libatomic, so we must use __sync_*_compare_and_swap_16
* and accept the sequential consistency that comes with it.
*/
pu = __builtin_assume_aligned(ps, 16);
do {
old = *pu;
new = (old & ~msk.u) | val.u;
} while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
qemu_build_not_reached();
#endif
}

/**
* store_bytes_leN:
* @pv: host address
33 changes: 28 additions & 5 deletions docs/devel/decodetree.rst
@@ -23,22 +23,42 @@ Fields

Syntax::

field_def := '%' identifier ( unnamed_field )* ( !function=identifier )?
field_def := '%' identifier ( field )* ( !function=identifier )?
field := unnamed_field | named_field
unnamed_field := number ':' ( 's' ) number
named_field := identifier ':' ( 's' ) number

For *unnamed_field*, the first number is the least-significant bit position
of the field and the second number is the length of the field. If the 's' is
present, the field is considered signed. If multiple ``unnamed_fields`` are
present, they are concatenated. In this way one can define disjoint fields.
present, the field is considered signed.

A *named_field* refers to some other field in the instruction pattern
or format. Regardless of the length of the other field where it is
defined, it will be inserted into this field with the specified
signedness and bit width.

Field definitions that involve loops (i.e. where a field is defined
directly or indirectly in terms of itself) are errors.

A format can include fields that refer to named fields that are
defined in the instruction pattern(s) that use the format.
Conversely, an instruction pattern can include fields that refer to
named fields that are defined in the format it uses. However you
cannot currently do both at once (i.e. pattern P uses format F; F has
a field A that refers to a named field B that is defined in P, and P
has a field C that refers to a named field D that is defined in F).

If multiple ``fields`` are present, they are concatenated.
In this way one can define disjoint fields.

If ``!function`` is specified, the concatenated result is passed through the
named function, taking and returning an integral value.

One may use ``!function`` with zero ``unnamed_fields``. This case is called
One may use ``!function`` with zero ``fields``. This case is called
a *parameter*, and the named function is only passed the ``DisasContext``
and returns an integral value extracted from there.

A field with no ``unnamed_fields`` and no ``!function`` is in error.
A field with no ``fields`` and no ``!function`` is in error.

Field examples:

@@ -56,6 +76,9 @@ Field examples:
| %shimm8 5:s8 13:1 | expand_shimm8(sextract(i, 5, 8) << 1 | |
| !function=expand_shimm8 | extract(i, 13, 1)) |
+---------------------------+---------------------------------------------+
| %sz_imm 10:2 sz:3 | expand_sz_imm(extract(i, 10, 2) << 3 | |
| !function=expand_sz_imm | extract(a->sz, 0, 3)) |
+---------------------------+---------------------------------------------+
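
As an editor's illustration of the last table row (a hedged sketch, not generated output and not part of the commit): once the pattern's ``sz`` field is available in the generated arguments struct, the ``%sz_imm`` value is assembled by concatenating the unnamed field above the named one and passing the result to the ``!function``. The names ``arg_example`` and ``expand_sz_imm`` are placeholders; only the ``extract32()`` shape mirrors QEMU's bitfield helper::

    #include <stdint.h>

    /* Placeholder stand-ins so the sketch is self-contained. */
    typedef struct { int sz; } arg_example;
    static int expand_sz_imm(int x) { return x << 1; }   /* arbitrary semantics */

    /* Same shape as QEMU's extract32() from "qemu/bitops.h". */
    static inline uint32_t extract32(uint32_t value, int start, int length)
    {
        return (value >> start) & (~0u >> (32 - length));
    }

    /* Roughly what "%sz_imm 10:2 sz:3 !function=expand_sz_imm" computes. */
    static int compute_sz_imm(const arg_example *a, uint32_t insn)
    {
        return expand_sz_imm((extract32(insn, 10, 2) << 3)   /* unnamed field 10:2 */
                             | extract32(a->sz, 0, 3));      /* named field sz:3 */
    }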

Argument Sets
=============
40 changes: 40 additions & 0 deletions host/include/aarch64/host/load-extract-al16-al8.h
@@ -0,0 +1,40 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic extract 64 from 128-bit, AArch64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/

#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H
#define AARCH64_LOAD_EXTRACT_AL16_AL8_H

#include "host/cpuinfo.h"
#include "tcg/debug-assert.h"

/**
* load_atom_extract_al16_or_al8:
* @pv: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
__int128_t *ptr_align = (__int128_t *)(pi & ~7);
int shr = (pi & 7) * 8;
uint64_t l, h;

/*
* With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned
* and single-copy atomic on the parts if 8-byte aligned.
* All we need do is align the pointer mod 8.
*/
tcg_debug_assert(HAVE_ATOMIC128_RO);
asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align));
return (l >> shr) | (h << (-shr & 63));
}

#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */
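
A hedged usage sketch, not from the tree (the real callers are the load_atom_* routines in accel/tcg/ldst_atomicity.c.inc, and host/include/ is assumed to be on the include path as in the QEMU build): once HAVE_ATOMIC128_RO holds, as the assert above enforces, a misaligned sub-8-byte load can be funnelled through the helper and truncated to the object size:

    #include "qemu/osdep.h"
    #include "host/load-extract-al16-al8.h"

    /*
     * Illustrative wrapper only: load 4 bytes from a misaligned address
     * with the atomicity described above (16-byte atomic if the bytes
     * stay inside one 16-byte block, 8-byte atomic otherwise).  The low
     * 32 bits of the returned value hold the host-endian result.
     */
    static uint32_t load_4_misaligned_atomic(void *pv)
    {
        return (uint32_t)load_atom_extract_al16_or_al8(pv, 4);
    }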
47 changes: 47 additions & 0 deletions host/include/aarch64/host/store-insert-al16.h
@@ -0,0 +1,47 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic store insert into 128-bit, AArch64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/

#ifndef AARCH64_STORE_INSERT_AL16_H
#define AARCH64_STORE_INSERT_AL16_H

/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
{
/*
* GCC only implements __sync* primitives for int128 on aarch64.
* We can do better without the barriers, and integrating the
* arithmetic into the load-exclusive/store-conditional pair.
*/
uint64_t tl, th, vl, vh, ml, mh;
uint32_t fail;

qemu_build_assert(!HOST_BIG_ENDIAN);
vl = int128_getlo(val);
vh = int128_gethi(val);
ml = int128_getlo(msk);
mh = int128_gethi(msk);

asm("0: ldxp %[l], %[h], %[mem]\n\t"
"bic %[l], %[l], %[ml]\n\t"
"bic %[h], %[h], %[mh]\n\t"
"orr %[l], %[l], %[vl]\n\t"
"orr %[h], %[h], %[vh]\n\t"
"stxp %w[f], %[l], %[h], %[mem]\n\t"
"cbnz %w[f], 0b\n"
: [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
: [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
}

#endif /* AARCH64_STORE_INSERT_AL16_H */
45 changes: 45 additions & 0 deletions host/include/generic/host/load-extract-al16-al8.h
@@ -0,0 +1,45 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic extract 64 from 128-bit, generic version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/

#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H
#define HOST_LOAD_EXTRACT_AL16_AL8_H

/**
* load_atom_extract_al16_or_al8:
* @pv: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
int o = pi & 7;
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
Int128 r;

pv = (void *)(pi & ~7);
if (pi & 8) {
uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
uint64_t a = qatomic_read__nocheck(p8);
uint64_t b = qatomic_read__nocheck(p8 + 1);

if (HOST_BIG_ENDIAN) {
r = int128_make128(b, a);
} else {
r = int128_make128(a, b);
}
} else {
r = atomic16_read_ro(pv);
}
return int128_getlo(int128_urshift(r, shr));
}

#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */
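
To make the shift arithmetic above concrete (a standalone sketch, not QEMU code): after the pointer is aligned down to 8 bytes, the wanted object sits at byte offset o = pi & 7 of the 16 bytes gathered into r, so the byte shift is o on little-endian hosts and 16 - s - o on big-endian hosts; in both cases the object lands in the low s bytes before int128_getlo():

    #include <assert.h>
    #include <stdbool.h>

    /* Mirrors the shr computation in load_atom_extract_al16_or_al8 above. */
    static int byte_shift(bool big_endian, int s, int o)
    {
        return big_endian ? 16 - s - o : o;
    }

    int main(void)
    {
        /* 4-byte object at offset 6 occupies bytes 6..9 of the window. */
        assert(byte_shift(false, 4, 6) == 6);    /* LE: skip bytes 0..5   */
        assert(byte_shift(true, 4, 6) == 6);     /* BE: skip bytes 10..15 */
        /* 2-byte object at offset 3 occupies bytes 3..4. */
        assert(byte_shift(false, 2, 3) == 3);
        assert(byte_shift(true, 2, 3) == 11);    /* BE: skip bytes 5..15  */
        return 0;
    }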
50 changes: 50 additions & 0 deletions host/include/generic/host/store-insert-al16.h
@@ -0,0 +1,50 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic store insert into 128-bit, generic version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/

#ifndef HOST_STORE_INSERT_AL16_H
#define HOST_STORE_INSERT_AL16_H

/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
{
#if defined(CONFIG_ATOMIC128)
__uint128_t *pu;
Int128Alias old, new;

/* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
pu = __builtin_assume_aligned(ps, 16);
old.u = *pu;
msk = int128_not(msk);
do {
new.s = int128_and(old.s, msk);
new.s = int128_or(new.s, val);
} while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true,
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
#else
Int128 old, new, cmp;

ps = __builtin_assume_aligned(ps, 16);
old = *ps;
msk = int128_not(msk);
do {
cmp = old;
new = int128_and(old, msk);
new = int128_or(new, val);
old = atomic16_cmpxchg(ps, cmp, new);
} while (int128_ne(cmp, old));
#endif
}

#endif /* HOST_STORE_INSERT_AL16_H */
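
A hedged sketch of the val/msk contract (the real callers are the store_atom_* routines in accel/tcg/ldst_atomicity.c.inc; the helper below is illustrative, assumes a little-endian host, and assumes the usual QEMU inclusion context for the atomic128 support): to deposit a 16-bit value at byte offset 7 of a 16-byte-aligned block, both the value and an all-ones mask of the same width are pre-shifted into position and handed to store_atom_insert_al16(). int128_make64(), int128_lshift() and MAKE_64BIT_MASK() are existing QEMU helpers:

    #include "qemu/osdep.h"
    #include "qemu/bitops.h"
    #include "qemu/int128.h"
    #include "qemu/atomic128.h"
    #include "host/store-insert-al16.h"

    /*
     * Illustrative only: store a 16-bit value at byte offset 7 of a
     * 16-byte-aligned block, leaving the other 14 bytes untouched and
     * updating the block atomically.
     */
    static void store_2_at_offset_7(Int128 *block16, uint16_t x)
    {
        Int128 val = int128_lshift(int128_make64(x), 7 * 8);
        Int128 msk = int128_lshift(int128_make64(MAKE_64BIT_MASK(0, 16)), 7 * 8);

        store_atom_insert_al16(block16, val, msk);
    }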
