amd64: WIP start implementing the amd64 SHA extensions
The SHA extensions have been available since 2017 on AMD (Epyc) and on Intel since Goldmont.
See bugzilla #398545
rurban committed Sep 3, 2020
1 parent c073ac7 commit f0fc15e
Showing 8 changed files with 228 additions and 2 deletions.
2 changes: 2 additions & 0 deletions VEX/priv/guest_amd64_defs.h
@@ -189,6 +189,8 @@ extern void amd64g_dirtyhelper_OUT ( ULong portno, ULong data,

extern void amd64g_dirtyhelper_SxDT ( void* address,
ULong op /* 0 or 1 */ );
extern void amd64g_dirtyhelper_SHAx ( V128 *out, V128 m0, V128 m1,
ULong op /* opcode part */ );

// This returns a 32-bit value from the host's RDRAND in bits 31:0, and the
// resulting C flag value in bit 32.
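A minimal sketch of how a caller could unpack the packed RDRAND result described in the comment above; the helper name is hypothetical, not from this header.

#include <stdint.h>

/* Hypothetical caller-side unpacking: bits 31:0 hold the RDRAND value,
   bit 32 holds the resulting C flag. */
static void unpack_rdrand_result ( uint64_t packed,
                                   uint32_t *value, int *cflag )
{
   *value = (uint32_t)(packed & 0xFFFFFFFFULL);   /* bits 31:0 */
   *cflag = (int)((packed >> 32) & 1);            /* bit 32    */
}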
177 changes: 177 additions & 0 deletions VEX/priv/guest_amd64_helpers.c
@@ -3449,6 +3449,147 @@ void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st,
# undef SET_ABCD
}

/* Claim to be the following baseline AMD Ryzen CPU (2 x ...), which is SHA_NI capable.
vendor_id : AuthenticAMD
cpu family : 23
model : 24
model name : AMD Ryzen 3 3200U with Radeon Vega Mobile Gfx
stepping : 1
microcode : 0x8108102
cpu MHz : 1681.759
cache size : 512 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 2
apicid : 3
initial apicid : 3
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate sme ssbd sev ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca
bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass
bogomips : 5190.28
TLB size : 2560 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 43 bits physical, 48 bits virtual
power management: ts ttp tm hwpstate eff_freq_ro [13] [14]
sha_ni is in ebx, bit 29
*/
void amd64g_dirtyhelper_CPUID_sha ( VexGuestAMD64State* st,
ULong hasF16C, ULong hasRDRAND )
{
vassert((hasF16C >> 1) == 0ULL);
vassert((hasRDRAND >> 1) == 0ULL);
# define SET_ABCD(_a,_b,_c,_d) \
do { st->guest_RAX = (ULong)(_a); \
st->guest_RBX = (ULong)(_b); \
st->guest_RCX = (ULong)(_c); \
st->guest_RDX = (ULong)(_d); \
} while (0)

UInt old_eax = (UInt)st->guest_RAX;
UInt old_ecx = (UInt)st->guest_RCX;

switch (old_eax) {
case 0x00000000:
SET_ABCD(0x0000000d, 0x68747541, 0x444d4163, 0x69746e65);
break;
case 0x00000001: {
// As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30),
// but patch in support for them as directed by the caller.
UInt ecx_extra
= (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0);
SET_ABCD(0x00810f81, 0x01040800, (0x7ed8320b | ecx_extra), 0x178bfbff);
break;
}
case 0x00000005:
SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000011);
break;
case 0x00000006:
SET_ABCD(0x00000004, 0x00000000, 0x00000001, 0x00000000);
break;
case 0x00000007:
if (old_ecx == 0x0)
SET_ABCD(0x00000000, 0x209c01a9, 0x00000000, 0x00000000);
else
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x0000000d:
switch (old_ecx) {
case 0x00000000:
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x00000001:
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x00000002:
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
default:
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
}
break;
case 0x80000000:
SET_ABCD(0x8000001f, 0x68747541, 0x444d4163, 0x69746e65);
break;
case 0x80000001:
SET_ABCD(0x00810f81, 0x00000000, 0x35c233ff, 0x2fd3fbff);
break;
case 0x80000002:
SET_ABCD(0x20444d41, 0x657a7952, 0x2033206e, 0x30303233);
break;
case 0x80000003:
SET_ABCD(0x69772055, 0x52206874, 0x6f656461, 0x6556206e);
break;
case 0x80000004:
SET_ABCD(0x4d206167, 0x6c69626f, 0x66472065, 0x00202078);
break;
case 0x80000005:
SET_ABCD(0xff40ff40, 0xff40ff40, 0x20080140, 0x40040140 );
break;
case 0x80000006:
SET_ABCD(0x26006400, 0x66006400, 0x02006140, 0x00208140 );
break;
case 0x80000007:
SET_ABCD(0x00000000, 0x0000001b, 0x00000000, 0x00006599 );
break;
case 0x80000008:
SET_ABCD(0x00003030, 0x00001007, 0x00004003, 0x00000000 );
break;
case 0x8000000a:
SET_ABCD(0x00000001, 0x00008000, 0x00000000, 0x0001bcff);
break;
case 0x80000019:
SET_ABCD(0xf040f040, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x8000001a:
SET_ABCD(0x00000003, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x8000001b:
SET_ABCD(0x000003ff, 0x00000000, 0x00000000, 0x00000000);
break;
case 0x8000001d:
SET_ABCD(0x00004121, 0x01c0003f, 0x0000003f, 0x00000000);
break;
case 0x8000001e:
SET_ABCD(0x00000001, 0x00000100, 0x00000000, 0x00000000);
break;
case 0x8000001f:
SET_ABCD(0x0000000f, 0x0000016f, 0x0000000f, 0x00000000);
break;
default:
SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
break;
}
# undef SET_ABCD
}
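For leaf 7 / subleaf 0 this helper answers with EBX = 0x209c01a9, which has bit 29 (sha_ni) set, matching the comment above. A minimal sketch of the corresponding guest-side check, assuming GCC/Clang's <cpuid.h>; the function name is illustrative, not part of the patch.

#include <stdio.h>
#include <cpuid.h>

/* Query CPUID leaf 7, subleaf 0 and test EBX bit 29 (sha_ni). */
static int have_sha_ni ( void )
{
   unsigned int eax, ebx, ecx, edx;
   if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
      return 0;
   return (ebx >> 29) & 1;
}

int main ( void )
{
   printf("sha_ni: %s\n", have_sha_ni() ? "yes" : "no");
   return 0;
}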


/*---------------------------------------------------------------*/
/*--- Misc integer helpers, including rotates and crypto. ---*/
@@ -3753,6 +3894,42 @@ void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
# endif
}

/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack. On non-amd64 platforms, do nothing. See dis_ESC_0F38() */
void amd64g_dirtyhelper_SHAx ( V128 *out, V128 m0, V128 m1, ULong op ) {
# if 0 && defined(__x86_64__)
switch (op) {
case 0xC8:
__asm__ __volatile__("sha1nexte (%0, %1)" : : "r" (m0) : "memory");
break;
case 0xC9:
__asm__ __volatile__("sha1msg1 (%0, %1)" : : "r" (m0) : "memory");
break;
case 0xCA:
__asm__ __volatile__("sha1msg (%0, %1)" : : "r" (m0) : "memory");
break;
case 0x3ACC:
__asm__ __volatile__("sha1rnds4 (%0, %1, %%xmm0)" : : "r" (m0) : "memory");
break;
case 0xCB:
__asm__ __volatile__("sha256rnds2 (%0, %1, %%xmm0)" : : "r" (m0) : "memory");
break;
case 0xCC:
__asm__ __volatile__("sha256msg1 (%0, %1)" : : "r" (m0) : "memory");
break;
case 0xCD:
__asm__ __volatile__("sha256msg2 (%0, %1)" : : "r" (m0) : "memory");
break;
default:
vpanic("amd64g_dirtyhelper_SHAx");
}
# else
/* do nothing */
((ULong*)out)[0] = 0UL;
((ULong*)out)[1] = 0UL;
# endif
}
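The block above is compiled out (note the `# if 0 &&` guard) and its operand syntax is still placeholder. A minimal sketch of how one case (0xC8, SHA1NEXTE) could be written so it actually assembles — assuming the operands are passed through memory, an assembler that knows the SHA extensions, and a stand-in for VEX's V128 type — not the patch's actual implementation.

#include <stdint.h>

/* Stand-in for VEX's V128; illustrative only. */
typedef struct { uint32_t w32[4]; } V128x;

static void sha1nexte_sketch ( V128x *out, const V128x *a, const V128x *b )
{
#if defined(__x86_64__)
   __asm__ __volatile__(
      "movdqu    %1, %%xmm0      \n\t"   /* xmm0 = *a               */
      "movdqu    %2, %%xmm1      \n\t"   /* xmm1 = *b               */
      "sha1nexte %%xmm1, %%xmm0  \n\t"   /* xmm0 = SHA1NEXTE(a, b)  */
      "movdqu    %%xmm0, %0      \n\t"   /* *out = result           */
      : "=m" (*out)
      : "m" (*a), "m" (*b)
      : "xmm0", "xmm1" );
#else
   out->w32[0] = out->w32[1] = out->w32[2] = out->w32[3] = 0;
#endif
}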

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack. On non-amd64 platforms, do nothing. On amd64 targets, get a
21 changes: 21 additions & 0 deletions VEX/priv/guest_amd64_toIR.c
@@ -22622,6 +22622,19 @@ Long dis_ESC_0F38 (
delta++;
switch (opc) {

case 0xC8: /* 0F 38 C8 = SHA1NEXTE m128, m128 */
case 0xC9: /* 0F 38 C9 = SHA1MSG1 m128, m128 */
case 0xCA: /* 0F 38 CA = SHA1MSG2 m128, m128 */
case 0xCB: /* 0F 38 CB = SHA256RNDS2 m128, m128, xmm0 */
case 0xCC: /* 0F 38 CC = SHA256MSG1 m128, m128 */
case 0xCD: /* 0F 38 CD = SHA256MSG2 m128, m128 */
/* FIXME: */
if (have66noF2noF3(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_SHA)) {
return 3;
}
return 3;
break;

case 0xF0: /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
if (!haveF2orF3(pfx) && !haveVEX(pfx)
@@ -32005,6 +32018,14 @@ Long dis_ESC_0F3A__VEX (
}
break;

case 0xCC: /* 0F 3A CC = SHA1RNDS4 m128, m128, imm8 */
/* FIXME: */
if (have66noF2noF3(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_SHA)) {
delta = 3;
goto decode_success;
}
break;

case 0xDF:
/* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
1 change: 0 additions & 1 deletion VEX/priv/host_amd64_defs.c
@@ -1,4 +1,3 @@

/*---------------------------------------------------------------*/
/*--- begin host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/
1 change: 1 addition & 0 deletions VEX/priv/host_amd64_isel.c
@@ -5303,6 +5303,7 @@ HInstrArray* iselSB_AMD64 ( const IRSB* bb,
| VEX_HWCAPS_AMD64_BMI
| VEX_HWCAPS_AMD64_AVX2
| VEX_HWCAPS_AMD64_F16C
| VEX_HWCAPS_AMD64_SHA
| VEX_HWCAPS_AMD64_RDRAND)));

/* Check that the host's endianness is as expected. */
5 changes: 5 additions & 0 deletions VEX/priv/main_main.c
@@ -1648,6 +1648,7 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps )
{ VEX_HWCAPS_AMD64_BMI, "bmi" },
{ VEX_HWCAPS_AMD64_F16C, "f16c" },
{ VEX_HWCAPS_AMD64_RDRAND, "rdrand" },
{ VEX_HWCAPS_AMD64_SHA, "sha_ni" },
};
/* Allocate a large enough buffer */
static HChar buf[sizeof prefix +
@@ -1960,6 +1961,7 @@ static void check_hwcaps ( VexArch arch, UInt hwcaps )
Bool have_avx = (hwcaps & VEX_HWCAPS_AMD64_AVX) != 0;
Bool have_bmi = (hwcaps & VEX_HWCAPS_AMD64_BMI) != 0;
Bool have_avx2 = (hwcaps & VEX_HWCAPS_AMD64_AVX2) != 0;
Bool have_sha = (hwcaps & VEX_HWCAPS_AMD64_SHA) != 0;

/* SSSE3 without SSE3 */
if (have_ssse3 && !have_sse3)
@@ -1976,6 +1978,9 @@ static void check_hwcaps ( VexArch arch, UInt hwcaps )
if (have_bmi && !have_avx)
invalid_hwcaps(arch, hwcaps,
"Support for BMI requires AVX capabilities\n");
if (have_sha && !have_avx)
invalid_hwcaps(arch, hwcaps,
"Support for SHA_NI requires AVX capabilities\n");
return;
}

3 changes: 2 additions & 1 deletion VEX/pub/libvex.h
@@ -99,7 +99,8 @@ typedef
#define VEX_HWCAPS_AMD64_BMI (1<<10) /* BMI1 instructions */
#define VEX_HWCAPS_AMD64_AVX2 (1<<11) /* AVX2 instructions */
#define VEX_HWCAPS_AMD64_RDRAND (1<<13) /* RDRAND instructions */
#define VEX_HWCAPS_AMD64_F16C (1<<14) /* F16C instructions */
#define VEX_HWCAPS_AMD64_F16C (1<<14) /* F16C instructions (ecx, bit 29) */
#define VEX_HWCAPS_AMD64_SHA (1<<15) /* SHA instructions (ebx, bit 29) */

/* ppc32: baseline capability is integer only */
#define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */
20 changes: 20 additions & 0 deletions configure.ac
@@ -3058,6 +3058,26 @@ AC_MSG_RESULT([no])

AM_CONDITIONAL(BUILD_F16C_TESTS, test x$ac_have_as_f16c = xyes)

# does the amd64 assembler understand SHA1MSG1 (from SHA-NI)?
# Note, this doesn't generate a C-level symbol. It generates an
# automake-level symbol (BUILD_SHA_TESTS), used in test Makefile.am's
# TODO: arm (SHA1M Qd, Sn, Vm.4S), power8
AC_MSG_CHECKING([if amd64 assembler knows the SHA insn])

AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[
do { long long int x;
__asm__ __volatile__(
"sha1msg1 %%xmm2, %%xmm1" : : : "xmm2", "xmm1" ); }
while (0)
]])], [
ac_have_as_sha=yes
AC_MSG_RESULT([yes])
], [
ac_have_as_sha=no
AC_MSG_RESULT([no])
])

AM_CONDITIONAL(BUILD_SHA_TESTS, test x$ac_have_as_sha = xyes)

# does the x86/amd64 assembler understand MOVBE?
# Note, this doesn't generate a C-level symbol. It generates a

1 comment on commit f0fc15e

@knight110001

I have tested this patch based on 3.16.1 and got the following error; it seems the return value of dis_ESC_0F38 is not right yet.

vex: priv/guest_amd64_toIR.c:32446 (disInstr_AMD64_WRK): Assertion `delta - delta_at_primary_opcode >= 0' failed.
vex storage: T total 1732979880 bytes allocated
vex storage: P total 512 bytes allocated

valgrind: the 'impossible' happened:
LibVEX called failure_exit().

host stacktrace:
==1997== at 0x58009EFA: show_sched_status_wrk (m_libcassert.c:406)
==1997== by 0x5800A017: report_and_quit (m_libcassert.c:477)
==1997== by 0x5800A26C: panic (m_libcassert.c:553)
==1997== by 0x5800A26C: vgPlain_core_panic_at (m_libcassert.c:558)
==1997== by 0x5800A28A: vgPlain_core_panic (m_libcassert.c:563)
==1997== by 0x5808DFB4: failure_exit (m_translate.c:761)
==1997== by 0x5810C04A: vex_assert_fail (main_util.c:245)
==1997== by 0x581A3921: disInstr_AMD64_WRK (guest_amd64_toIR.c:32446)
==1997== by 0x581A3CCF: disInstr_AMD64 (guest_amd64_toIR.c:32562)
==1997== by 0x58127739: disassemble_basic_block_till_stop (guest_generic_bb_to_IR.c:954)
==1997== by 0x58128784: bb_to_IR (guest_generic_bb_to_IR.c:1363)
==1997== by 0x58109814: LibVEX_FrontEnd (main_main.c:583)
==1997== by 0x5810A182: LibVEX_Translate (main_main.c:1235)
==1997== by 0x58090A43: vgPlain_translate (m_translate.c:1828)
==1997== by 0x5805DC0F: handle_tt_miss (scheduler.c:1138)
==1997== by 0x5805DC0F: vgPlain_scheduler (scheduler.c:1500)
==1997== by 0x580BD2C0: thread_wrapper (syswrap-linux.c:101)
==1997== by 0x580BD2C0: run_a_thread_NORETURN (syswrap-linux.c:154)
==1997== by 0x580BD5AA: vgModuleLocal_start_thread_NORETURN (syswrap-linux.c:328)
==1997== by 0x5806EDDD: ??? (in /usr/local/lib/valgrind/massif-amd64-linux)

BTW, my original issue on an "AMD EPYC 7302P 16-Core Processor" is as follows:
vex amd64->IR: unhandled instruction bytes: 0xF 0x38 0xCC 0xFA 0xF 0x38 0xCB 0xD9 0xC5 0xF9
vex amd64->IR: REX=0 REX.W=0 REX.R=0 REX.X=0 REX.B=0
vex amd64->IR: VEX=0 VEX.L=0 VEX.nVVVV=0x0 ESC=0F38
vex amd64->IR: PFX.66=0 PFX.F2=0 PFX.F3=0
==10543== valgrind: Unrecognised instruction at address 0x1b9fbcc.
==10543== at 0x1B9FBCC: _mm_sha256msg1_epu32 (sha.rs:100)
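For reference, guest code of the kind shown in that backtrace can be reproduced with a small C program using the SHA-NI intrinsics — illustrative only, assuming GCC/Clang with -msha — which compiles to the same 0F 38 CC (sha256msg1) and 0F 38 CB (sha256rnds2) opcodes reported above:

#include <immintrin.h>
#include <stdio.h>

/* Build with: gcc -msha sha_ni_demo.c */
int main ( void )
{
   __m128i a = _mm_set1_epi32(0x01234567);
   __m128i b = _mm_set1_epi32((int)0x89abcdef);

   /* SHA-256 message schedule and rounds: 0F 38 CC and 0F 38 CB. */
   __m128i msg    = _mm_sha256msg1_epu32(a, b);
   __m128i rounds = _mm_sha256rnds2_epu32(a, b, msg);

   /* SHA-1 rounds with an immediate round selector: 0F 3A CC. */
   __m128i s1 = _mm_sha1rnds4_epu32(a, b, 0);

   printf("%d %d\n", _mm_cvtsi128_si32(rounds), _mm_cvtsi128_si32(s1));
   return 0;
}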
